<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
    <front>
        <journal-meta>
            <journal-id journal-id-type="publisher-id">JMIR</journal-id>
            <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
            <journal-title>Journal of Medical Internet Research</journal-title>
            <issn pub-type="epub">1438-8871</issn>
            <publisher>
                <publisher-name>Gunther Eysenbach</publisher-name>
                <publisher-loc>Centre for Global eHealth Innovation, Toronto, Canada</publisher-loc>
            </publisher>
        </journal-meta>
        <article-meta>
            <article-id pub-id-type="publisher-id">v12i3e43</article-id>
            <article-id pub-id-type="pmid">20876049</article-id>
            <article-id pub-id-type="doi">10.2196/jmir.1323</article-id>
            <article-categories>
                <subj-group subj-group-type="article-type">
                    <subject>Original Paper</subject>
                </subj-group>
            </article-categories>
            <title-group>
                <article-title>Developing a Disease Outbreak Event Corpus</article-title>
            </title-group>
            <contrib-group>
                <contrib contrib-type="editor">
                    <name>
                        <surname>Eysenbach</surname>
                        <given-names>Gunther</given-names>
                    </name>
                </contrib>
            </contrib-group>
            <contrib-group>
                <contrib contrib-type="reviewer">
                    <name>
                        <surname>Hirschman</surname>
                        <given-names>Lynette</given-names>
                    </name>
                </contrib>
                <contrib contrib-type="reviewer">
                    <name>
                        <surname>Tolentino</surname>
                        <given-names>Herman</given-names>
                    </name>
                </contrib>
                <contrib contrib-type="reviewer">
                    <name>
                        <surname>Freifeld</surname>
                        <given-names>Clark</given-names>
                    </name>
                </contrib>
            </contrib-group>
            <contrib-group>
                <contrib contrib-type="author" id="contrib1" corresp="yes">
                    <name name-style="western">
                        <surname>Conway</surname>
                        <given-names>Mike</given-names>
                    </name>
                    <degrees>PhD</degrees>
                    <xref ref-type="aff" rid="aff1">1</xref>
                    <address>
                        <institution>National Institute of Informatics</institution>
                        <addr-line>212 Hitotsubashi, Chiyoda-ku</addr-line>
                        <addr-line>Tokyo 101-8430</addr-line>
                        <country>Japan</country>
                        <phone>81 3 4212 2677</phone>
                        <fax>81 3 3556 1916</fax>
                        <email>mike@nii.ac.jp</email>
                    </address>
                </contrib>
                <contrib contrib-type="author" id="contrib2">
                    <name name-style="western">
                        <surname>Kawazoe</surname>
                        <given-names>Ai</given-names>
                    </name>
                    <degrees>PhD</degrees>
                    <xref ref-type="aff" rid="aff2">2</xref>
                </contrib>
                <contrib contrib-type="author" id="contrib3">
                    <name name-style="western">
                        <surname>Chanlekha</surname>
                        <given-names>Hutchatai</given-names>
                    </name>
                    <degrees>PhD</degrees>
                    <xref ref-type="aff" rid="aff1">1</xref>
                </contrib>
                <contrib contrib-type="author" id="contrib4">
                    <name name-style="western">
                        <surname>Collier</surname>
                        <given-names>Nigel</given-names>
                    </name>
                    <degrees>PhD</degrees>
                    <xref ref-type="aff" rid="aff1">1</xref>
                </contrib>
            </contrib-group>
            <aff id="aff2" rid="aff2">
                <sup>2</sup>
                <institution>Center for Women in Research</institution>
                <institution>Tsuda College</institution>
                <addr-line>Tokyo</addr-line>
                <country>Japan</country>
            </aff>
            <aff id="aff1" rid="aff1">
                <sup>1</sup>
                <institution>National Institute of Informatics</institution>
                <addr-line>Tokyo</addr-line>
                <country>Japan</country>
            </aff>
            <pub-date pub-type="collection">
                <season>Jul-Sep</season>
                <year>2010</year>
            </pub-date>
            <pub-date pub-type="epub">
                <day>28</day>
                <month>09</month>
                <year>2010</year>
            </pub-date>
            <volume>12</volume>
            <issue>3</issue>
            <elocation-id>e43</elocation-id>
            <!--history from ojs - api-xml-->
            <history>
                <date date-type="received">
                    <day>28</day>
                    <month>07</month>
                    <year>2009</year>
                </date>
                <date date-type="rev-request">
                    <day>18</day>
                    <month>11</month>
                    <year>2009</year>
                </date>
                <date date-type="rev-recd">
                    <day>21</day>
                    <month>02</month>
                    <year>2010</year>
                </date>
                <date date-type="accepted">
                    <day>12</day>
                    <month>03</month>
                    <year>2010</year>
                </date>
            </history>
            <!--(c) the authors - correct author names and publication date here if necessary. Date in form ', dd.mm.yyyy' after jmir.org-->
            <copyright-statement>&#169;Mike Conway, Ai Kawazoe, Hutchatai Chanlekha, Nigel Collier. Originally published in the Journal of Medical Internet Research (http://www.jmir.org), 28.09.2010 &#160;</copyright-statement>
            <copyright-year>2010</copyright-year>
            <license license-type="open-access" xlink:href="http://creativecommons.org/licenses/by/2.0/">
                <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (http://creativecommons.org/licenses/by/2.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on http://www.jmir.org/, as well as this copyright and license information must be included.</p>
            </license>
            <self-uri xlink:href="http://www.jmir.org/2010/3/e43/" xlink:type="simple" />
            <abstract>
                <sec>
                    <title>Background</title>
                    <p>In recent years, there has been a growth in work on the use of information extraction technologies for tracking disease outbreaks from online news texts, yet publicly available evaluation standards (and associated resources) for this new area of research have been noticeably lacking.</p>
                </sec>
                <sec>
                    <title>Objective</title>
                    <p>This study seeks to create a &#8220;gold standard&#8221; data set against which to test how accurately disease outbreak information extraction systems can identify the semantics of disease outbreak events. Additionally, we hope that the provision of an annotation scheme (and associated corpus) to the community will encourage open evaluation in this new and growing application area.</p>
                </sec>
                <sec>
                    <title>Methods</title>
                    <p>We developed an annotation scheme for identifying infectious disease outbreak events in news texts. An event&#8212;in the context of our annotation scheme&#8212;consists minimally of geographical (eg, country and province) and disease name information. However, the scheme also allows for the rich encoding of other domain-salient concepts (eg, international travel, species, and food contamination).</p>
                </sec>
                <sec>
                    <title>Results</title>
                    <p>The work resulted in a 200-document corpus of event-annotated disease outbreak reports that can be used to evaluate the accuracy of event detection algorithms (in this case, for the BioCaster biosurveillance online news information extraction system). In the 200 documents, 394 distinct events were identified (mean 1.97 events per document, range 0-25 events per document). We also provide a download script and graphical user interface (GUI)-based event browsing software to facilitate corpus exploration.</p>
                </sec>
                <sec>
                    <title>Conclusion</title>
                    <p>In summary, we present an annotation scheme and corpus that can be used in the evaluation of disease outbreak event extraction algorithms. The annotation scheme and corpus were designed both with the particular evaluation requirements of the BioCaster system in mind as well as the wider need for further evaluation resources in this growing research area.</p>
                </sec>
            </abstract>
            <kwd-group>
                <kwd>Biosurveillance</kwd>
                <kwd>disease outbreaks</kwd>
                <kwd>natural language processing</kwd>
                <kwd>corpora</kwd>
                <kwd>text mining</kwd>
                <kwd>information extraction</kwd>
                <kwd>public health informatics</kwd>
            </kwd-group>
        </article-meta>
    </front>
    <body>
        <sec sec-type="introduction">
            <title>Introduction</title>
            <p>The need for computational tools for the tracking of emerging disease outbreaks from text has become increasingly important in recent years [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>], leading to the development of various machine-aided surveillance systems (eg, Global Public Health Intelligence Network (GPHIN) [<xref ref-type="bibr" rid="ref3">3</xref>], HealthMap [<xref ref-type="bibr" rid="ref4">4</xref>], BioCaster [<xref ref-type="bibr" rid="ref5">5</xref>], MedISys [<xref ref-type="bibr" rid="ref6">6</xref>], Pattern-based Understanding and Learning System (PULS) [<xref ref-type="bibr" rid="ref7">7</xref>], and EpiSPIDER [<xref ref-type="bibr" rid="ref8">8</xref>]). One way to evaluate the semantics of such a system is to construct an event frame (ie, template), which is then associated with each outbreak event in a sample of news documents (the nature and scope of reportable events vary according to the case definition of each system). This paper reports on such a data set&#8212;an annotation scheme and corpus [<xref ref-type="bibr" rid="ref9">9</xref>]&#8212;developed for disease outbreak event detection in the context of the BioCaster biosurveillance online news information extraction (IE) system [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref5">5</xref>].</p>
            <p>We believe that a focus on event extraction offers additional advantages over methods based solely on information retrieval (IR). Traditional IR systems allow us to identify reports based on the presence or absence of disease terms, whereas event-based IE approaches enable us to dig deep into a report&#8217;s semantics. The mere presence of a disease term in a text should not necessarily lead us to the conclusion that the report contains pressing information about an outbreak. Indeed, Steinberger et al estimated that 63% of documents selected using traditional IR techniques do not contain outbreak events [<xref ref-type="bibr" rid="ref11">11</xref>]. For example, vaccination campaigns, medical research results, and public health advice often occur in news texts and are likely to generate false positives if we rely solely on IR to identify documents of interest. An event-based strategy facilitates the exclusion of nonrelevant documents from further processing and could form the basis of more sophisticated text mining and visualization while providing richer outbreak data for end users. Note that the event-based approach suggested here requires antecedent document selection and named entity recognition (NER) modules (ie, a pipeline with a document selection module inputting relevant documents to an NER module before this output is piped to an event extraction module). In the case of the BioCaster system, the document selection module has a particularly important &#8220;gate-keeping&#8221; function as the system accepts input from over 1700 RSS feeds&#8212;far too many documents to subject to the computationally intensive NER and event extraction processes [<xref ref-type="bibr" rid="ref10">10</xref>].</p>
            <p>The event annotation scheme aims to identify each infectious disease outbreak event in a given text with its associated disease, time, location (at various levels of granularity), and other relevant information. An annotated corpus is necessary in order to evaluate the performance of the current BioCaster IE system and also serves as a test bed for the development of new biosurveillance-specific IE algorithms and techniques. Further, the provision of a reusable resource facilitates further work on disease event extraction and encourages the development of the field, as it has been shown that the provision of such resources (often in conjunction with organized &#8220;challenge evaluations&#8221; similar to, for instance, the Text Retrieval Conference (TREC) Genomics Track [<xref ref-type="bibr" rid="ref12">12</xref>]) has increased research momentum for other IE tasks [<xref ref-type="bibr" rid="ref13">13</xref>].</p>
            <p>Previous work on evaluation for disease outbreak report IE systems has focused on disparate aspects of performance. For example, Blench [<xref ref-type="bibr" rid="ref14">14</xref>] found that the GPHIN system identified 56% of the outbreaks verified by the World Health Organization (WHO) over a three-year period, while Freifeld et al [<xref ref-type="bibr" rid="ref15">15</xref>] found that the HealthMap system successfully classified 84% of reports by disease and location over a one-month period. Kawazoe et al [<xref ref-type="bibr" rid="ref16">16</xref>] reported that the BioCaster system&#8217;s NER module achieved an F-score of 76.97, while for the PULS system (which is an event extraction system that relies on input from the MedISys IR system), it is estimated that approximately 72% of the extracted events are correct [<xref ref-type="bibr" rid="ref11">11</xref>]. While this kind of evaluation work is important for system developers, the obvious difficulty in comparing reported results illustrates the need for a community-wide data set for algorithm testing.</p>
            <p>The structure of the paper is as follows. First, we describe the event annotation scheme we developed. We then set out agreement statistics before finally presenting a description of the corpus and associated software.</p>
        </sec>
        <sec>
            <title>Annotation Scheme</title>
            <p>Each document is associated with zero or more event frames reflecting the number of outbreak events described in the text (a full description of the annotation scheme and all associated software can be downloaded from the project Google Code site [<xref ref-type="bibr" rid="ref9">9</xref>]). The event frames are designed to capture the properties of outbreak reports that are of interest to public health experts and epidemiologists. Event frames are formatted in extensible markup language (XML) (see <xref ref-type="fig" rid="figure1">Figure 1</xref>) and consist of property names and their associated values derived from the document source (eg, HAS_DISEASE, &#8220;Ebola&#8221;). Reports have already been tagged for named entities such as person names, disease names, location names, and so on (twelve in total) using an ontology-based annotation scheme developed specifically for the disease outbreak domain [<xref ref-type="bibr" rid="ref16">16</xref>]. Property names are of two types. First, entity properties are filled with appropriate entities derived directly from the text of interest (entity properties are conceptually similar to Message Understanding Conference (MUC) style &#8220;string fills&#8221;). For example, the HAS_DISEASE property could only have the value &#8220;polio&#8221; if &#8220;polio&#8221; is tagged as an entity in the document. Second, fixed slots (equivalent to MUC-style &#8220;set fills&#8221;) take prespecified values of a restricted kind (normally simply Boolean true or false values), and, unlike entity values, are <italic>inferred</italic> from the document. For example, the INTERNATIONAL_TRAVEL property accepts only Boolean values.</p>
            <p>The following are the entity properties (which are filled by named entities) and their definitions:</p>
            <list list-type="bullet">
                <list-item>
                    <p>HAS_DISEASE: disease that caused the outbreak (eg, Ebola)</p>
                </list-item>
                <list-item>
                    <p>HAS_LOCATION.COUNTRY: country where the outbreak occurred (eg, United States, Indonesia)</p>
                </list-item>
                <list-item>
                    <p>HAS_LOCATION.PROVINCE: province in which the outbreak occurred (eg, Kanagawa, New Hampshire)</p>
                </list-item>
                <list-item>
                    <p>HAS_LOCATION.OTHER: other geographical location (eg, Balkans, New England)</p>
                </list-item>
                <list-item>
                    <p>HAS_AGENT: agent (pathogen) of the disease (eg, HIV)</p>
                </list-item>
            </list>
            <p>The following are the &#8220;fixed&#8221; slots (which are inferred from the text and take prespecified values) and their definitions:</p>
            <list list-type="bullet">
                <list-item>
                    <p>HAS_SPECIES: human or non_human</p>
                </list-item>
                <list-item>
                    <p>TIME.relative: historical (more than three months ago), recent_past (between two weeks and three months ago), present (within the last two weeks), and hypothetical</p>
                </list-item>
            </list>
            <list list-type="bullet">
                <list-item>
                    <p>ZOONOSIS: has species transfer occurred? (Boolean)</p>
                </list-item>
                <list-item>
                    <p>DRUG_RESISTANCE: is the disease drug resistant? (Boolean)</p>
                </list-item>
                <list-item>
                    <p>NEW_TYPE_AGENT: is the disease a new strain? (Boolean)</p>
                </list-item>
                <list-item>
                    <p>ACCIDENTAL_RELEASE: has the disease been released accidentally? (Boolean)</p>
                </list-item>
                <list-item>
                    <p>INTERNATIONAL_TRAVEL: is international travel involved? (Boolean)</p>
                </list-item>
                <list-item>
                    <p>FOOD_CONTAMINATION: is the outbreak caused by contaminated food or water? (Boolean)</p>
                </list-item>
                <list-item>
                    <p>HOSPITAL_WORKER: are any victims hospital workers? (Boolean)</p>
                </list-item>
                <list-item>
                    <p>FARM_WORKER: are any victims farm workers? (Boolean)</p>
                </list-item>
                <list-item>
                    <p>PRODUCT_MALFORMATION: are contaminated blood products or vaccines implicated? (Boolean)</p>
                </list-item>
            </list>
            <p>A working group consisting of the current paper&#8217;s authors developed the annotation scheme over a period of several months guided by the World Health Organization International Health Regulations [<xref ref-type="bibr" rid="ref17">17</xref>] (see <xref ref-type="boxed-text" rid="box1">Textbox 1</xref>) and using advice provided by the National Institute of Infectious Diseases in Japan.</p>
            <fig id="figure1" position="float">
                <label>Figure 1</label>
                <caption>
                    <p>Worked example of event frame construction from raw text. Note that this paper focuses on the construction of event frames from documents already tagged for named entities. The named entity tagging process is described by Kawazoe et al [<xref ref-type="bibr" rid="ref16">16</xref>].</p>
                </caption>
                <!--Original graphic name: http://writer.zoho.com:80/image.do?imgurl=6d37aff7952c652b15981bdb67ae7d24753f78e991a900217e61b98366e8ce2e7924bf28d0fdd0982982e4acc6fd742c-->
                <graphic xlink:href="jmir_v12i3e43_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple" />
            </fig>
            <boxed-text id="box1" position="float">
                <title>Using the World Health Organization international health regulations (Annex 2 decision instrument) as the basis for an annotation scheme</title>
                <p>
                    <bold>World Health Organization International Health Regulations (Annex 2 Decision Instrument)</bold>
                </p>
                <p>In developing our guidelines, we took inspiration from the World Health Organization&#8217;s International Health Regulations Annex 2 decision instrument [<xref ref-type="bibr" rid="ref17">17</xref>] for building our own decision tree. At the top level, this included general questions such as, &#8220;Is the topic of the article mainly about a current disease outbreak?&#8221; and &#8220;Are the victims of the disease mainly humans?&#8221; These questions were designed to guide annotators in the most obvious stages of selection. At lower levels, the decision tree we developed touched on issues more central to the Annex 2 decision tree such as known notifiable diseases (eg, SARS, Smallpox, Poliomyelitis, Cholera, West Nile Virus). The notion of &#8220;unusual&#8221; or &#8220;unexpected&#8221; is underspecified in Annex 2 but would be apparent to a public health expert familiar with the field. We tried to make the notion explicit for our annotators by encoding questions about the virulence and infectivity of the pathogen, the severity of the reported cases, the involvement of international travel, and drug resistance or accidental/deliberate release.</p>
            </boxed-text>
        </sec>
        <sec>
            <title>Agreement Study and Error Analysis</title>
            <p>To gain insight into how consistently the scheme could be applied and to help pinpoint areas of systematic annotator error, we conducted a 100-document interannotator agreement study. We recruited and trained one annotator and compared that individual&#8217;s annotations with those of an annotator who was involved in the original annotation scheme design process.</p>
            <p>Following the recommendations of Wilbur et al [<xref ref-type="bibr" rid="ref18">18</xref>], we used percentage scores to assess agreement rather than the kappa statistic [<xref ref-type="bibr" rid="ref19">19</xref>]. While some researchers in annotation scheme design refrain from the use of agreement studies entirely (eg, [<xref ref-type="bibr" rid="ref20">20</xref>]), we felt that this exercise would help to draw out any systematic annotator difficulties and also facilitate the debugging of the annotation scheme and corpus.</p>
            <p>We found that the two annotators agreed on the number of disease outbreak events 67% of the time. However, calculating agreement at the level of individual properties (eg, TIME.relative) was not as straightforward as calculating event number agreement for the following three reasons: (1) Annotators could identify a differing number of events for a document. (2) Unless both annotators produced just 1 (or zero) event frames, we were faced with the problem of aligning events. (3) The annotation scheme allowed for an arbitrary number of property values, reflecting synonymous or near synonymous terms in the source document. For example, it was not unusual to see a property/value pairing such as HAS_DISEASE=&#8220;bird flu&#124;H5N1&#124;avian influenza.&#8221;</p>
            <p>Therefore, we concentrated our analysis on those 42 documents where only one event was identified per annotator, thus allowing for a direct comparison. These data are partially summarized in <xref ref-type="table" rid="table1">Table 1</xref>, where it can be seen that the annotators agreed 100% of the time on DRUG_RESISTANCE, FARM_WORKER, INTERNATIONAL_TRAVEL, and PRODUCT_MALFORMATION. Agreement was worst for FOOD_CONTAMINATION and ZOONOSIS. Major sources of disagreement are summarized in <xref ref-type="boxed-text" rid="box2">Textbox 2</xref>.</p>
            <p>The fixed slot properties, TIME.relative and HAS_SPECIES, are not Boolean and therefore are not represented in <xref ref-type="table" rid="table1">Table 1</xref>. TIME.relative had four values (historical, recent_past, present, and hypothetical) and achieved an agreement score of 90.5% (with the most frequent value being &#8220;present&#8221;). HAS_SPECIES had two values (human and non_human) and achieved an agreement of 90.2%. More information about the annotation guidelines is available in [<xref ref-type="bibr" rid="ref9">9</xref>].</p>
            <p>The entity properties (eg, HAS_LOCATION.PROVINCE, HAS_DISEASE) were filled by tagged entities in the text. Agreement for HAS_DISEASE was 100% and for HAS_LOCATION.COUNTRY was 97.7%.</p>
            <table-wrap id="table1" position="float">
                <label>Table 1</label>
                <caption>
                    <p>Agreement for 42 documents with precisely one event per annotator (note that only Boolean fixed slot properties are shown)</p>
                </caption>
                <table cellpadding="8" cellspacing="0" border="1" rules="groups" frame="hsides" width="1000">
                    <col width="250" />
                    <col width="150" />
                    <col width="150" />
                    <col width="150" />
                    <col width="150" />
                    <col width="150" />
                    <thead>
                        <tr valign="top">
                            <td />
                            <td colspan="5">Agreement for Fixed Slot Properties in Each of 42 Documents</td>
                        </tr>
                        <tr valign="top">
                            <td>Property</td>
                            <td>Annotator 1 (true)</td>
                            <td>Annotator 1 (false)</td>
                            <td>Annotator 2 (true)</td>
                            <td>Annotator 2 (false)</td>
                            <td>Agreement (%)</td>
                        </tr>
                    </thead>
                    <tbody>
                        <tr valign="top">
                            <td>DRUG_RESISTANCE</td>
                            <td>0</td>
                            <td>42</td>
                            <td>0</td>
                            <td>42</td>
                            <td>100.0</td>
                        </tr>
                        <tr valign="top">
                            <td>FARM_WORKER</td>
                            <td>0</td>
                            <td>42</td>
                            <td>0</td>
                            <td>42</td>
                            <td>100.0</td>
                        </tr>
                        <tr valign="top">
                            <td>FOOD_CONTAMINATION</td>
                            <td>5</td>
                            <td>37</td>
                            <td>13</td>
                            <td>29</td>
                            <td>71.4</td>
                        </tr>
                        <tr valign="top">
                            <td>HOSPITAL_WORKER</td>
                            <td>0</td>
                            <td>42</td>
                            <td>1</td>
                            <td>41</td>
                            <td>97.6</td>
                        </tr>
                        <tr valign="top">
                            <td>INTERNATIONAL_TRAVEL</td>
                            <td>0</td>
                            <td>42</td>
                            <td>0</td>
                            <td>42</td>
                            <td>100.0</td>
                        </tr>
                        <tr valign="top">
                            <td>PRODUCT_MALFORMATION</td>
                            <td>0</td>
                            <td>42</td>
                            <td>0</td>
                            <td>42</td>
                            <td>100.0</td>
                        </tr>
                        <tr valign="top">
                            <td>ZOONOSIS</td>
                            <td>7</td>
                            <td>35</td>
                            <td>12</td>
                            <td>30</td>
                            <td>83.0</td>
                        </tr>
                    </tbody>
                </table>
            </table-wrap>
            <boxed-text id="box2" position="float">
                <title>Sources of disagreement</title>
                <p>
                    <bold>Event Agreement</bold>
                </p>
                <p>On detailed examination of the data, a systematic problem concerning event granularity emerged accounting for the relatively low 67% event agreement rate. Our analysis showed that the issue of suspected zoonosis (ie, unconfirmed zoonosis or where zoonosis is presented as one possible explanation for a human disease) was central here. One annotator produced two events (one human, one non_human), while the other annotator only produced one event (human), ignoring the suspicion of, or speculation about, zoonosis.</p>
                <p>
                    <bold>Annotator Error</bold>
                </p>
                <p>We can distinguish annotator disagreement arising from ambiguity in the annotation guidelines from disagreement caused by straightforward annotator mistakes. For instance, there are several examples where the temporal categories, present (within two weeks of the document time stamp) and recent_past (more than two weeks, but less than three months from the document time stamp), were confused.</p>
                <p>
                    <bold>Background Knowledge and Inference</bold>
                </p>
                <p>For those properties that require an annotator to infer a category from the document (TIME.relative, ZOONOSIS, HAS_SPECIES, INTERNATIONAL_TRAVEL, DRUG_RESISTANCE, FOOD_CONTAMINATION, HOSPITAL_WORKER, FARM_WORKER, and PRODUCT_MALFORMATION), there is scope for incorrect inference. For example, several of the documents in the agreement study data set concern cholera. While cholera is spread primarily through water contamination (ie, FOOD_CONTAMINATION), this is not stated explicitly in the text. Only one of the annotators marked these documents as true for FOOD_CONTAMINATION, suggesting that the annotator who marked the property false was unaware of the primary transmission route for cholera.</p>
            </boxed-text>
        </sec>
        <sec>
            <title>Corpus Description</title>
            <p>The corpus consists of 200 documents (all in English) and their associated event frames, with documents gathered from a variety of sources (see <xref ref-type="table" rid="table2">Table 2</xref>). The largest single source was ProMed-Mail [<xref ref-type="bibr" rid="ref21">21</xref>], an expert-curated infectious disease reporting service. Additionally, documents were not randomly sampled, but rather selected to represent diseases and geographical areas of interest to the researchers. Major international news providers are also represented (eg, CBC, Reuters, BBC) in addition to primarily Asian or Asia-Pacific news services (eg, Vietnam-net, Thailand&#8217;s The Nation). Documents range from 45 to 1487 words long, with a mean of 305.9 words (without markup). Document selection was performed by author MC (see corpus documentation [<xref ref-type="bibr" rid="ref9">9</xref>] for details).</p>
            <table-wrap id="table2" position="float">
                <label>Table 2</label>
                <caption>
                    <p>Corpus document sources (200 documents)</p>
                </caption>
                <table cellpadding="8" cellspacing="0" border="1" rules="groups" frame="hsides" width="1000">
                    <col width="500" />
                    <col width="250" />
                    <col width="250" />
                    <thead>
                        <tr valign="top">
                            <td>Document Source</td>
                            <td>Number of Documents</td>
                            <td>% of 200</td>
                        </tr>
                    </thead>
                    <tbody>
                        <tr valign="top">
                            <td>ProMed-Mail</td>
                            <td>43</td>
                            <td>21.5</td>
                        </tr>
                        <tr valign="top">
                            <td>Reuters</td>
                            <td>16</td>
                            <td>8.0</td>
                        </tr>
                        <tr valign="top">
                            <td>BBC</td>
                            <td>16</td>
                            <td>8.0</td>
                        </tr>
                        <tr valign="top">
                            <td>WHO</td>
                            <td>41</td>
                            <td>20.5</td>
                        </tr>
                        <tr valign="top">
                            <td>CBS</td>
                            <td>13</td>
                            <td>6.5</td>
                        </tr>
                        <tr valign="top">
                            <td>CBC</td>
                            <td>17</td>
                            <td>8.5</td>
                        </tr>
                        <tr valign="top">
                            <td>Vietnam-net</td>
                            <td>12</td>
                            <td>6.0</td>
                        </tr>
                        <tr valign="top">
                            <td>Hindustan Times</td>
                            <td>18</td>
                            <td>9.0</td>
                        </tr>
                        <tr valign="top">
                            <td>The Nation (Thailand)</td>
                            <td>9</td>
                            <td>4.5</td>
                        </tr>
                        <tr valign="top">
                            <td>All Africa</td>
                            <td>5</td>
                            <td>2.5</td>
                        </tr>
                        <tr valign="top">
                            <td>Xinhua (China)</td>
                            <td>5</td>
                            <td>2.5</td>
                        </tr>
                        <tr valign="top">
                            <td>Antara (Indonesia)</td>
                            <td>5</td>
                            <td>2.5</td>
                        </tr>
                    </tbody>
                </table>
            </table-wrap>
            <p>Of the 394 annotated events in the corpus, 75.4% describe human (rather than animal) disease events (see <xref ref-type="table" rid="table3">Table 3</xref>). Most of the events identified (81.5%) have been classified as present outbreaks, although historical, recent past, and hypothetical events are also represented. To show the geographical range of the documents selected, the geographical distribution of events (by country) is shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>. Note that the map does not show the actual distribution of disease events, but rather the geographical distribution of disease events in our corpus.
            </p>
            <table-wrap id="table3" position="float">
                <label>Table 3</label>
                <caption>
                    <p>Event statistics (total number of events is 394)</p>
                </caption>
                <table cellpadding="8" cellspacing="0" border="1" rules="groups" frame="hsides" width="1000">
                    <col width="500" />
                    <col width="250" />
                    <col width="250" />
                    <thead>
                        <tr valign="top">
                            <td>Type of Event</td>
                            <td>Number of Events</td>
                            <td>% of 394</td>
                        </tr>
                    </thead>
                    <tbody>
                        <tr valign="top">
                            <td>Events involving humans</td>
                            <td>297</td>
                            <td>75.4</td>
                        </tr>
                        <tr valign="top">
                            <td>Events involving food contamination</td>
                            <td>35</td>
                            <td>8.9</td>
                        </tr>
                        <tr valign="top">
                            <td>Events involving hospital workers</td>
                            <td>3</td>
                            <td>0.8</td>
                        </tr>
                        <tr valign="top">
                            <td>Events involving malformed products</td>
                            <td>2</td>
                            <td>0.5</td>
                        </tr>
                        <tr valign="top">
                            <td>Events classified as present</td>
                            <td>321</td>
                            <td>81.5</td>
                        </tr>
                        <tr valign="top">
                            <td>Events classified as historical</td>
                            <td>49</td>
                            <td>12.4</td>
                        </tr>
                        <tr valign="top">
                            <td>Events classified as recent_past</td>
                            <td>11</td>
                            <td>2.8</td>
                        </tr>
                        <tr valign="top">
                            <td>Events classified as hypothetical</td>
                            <td>13</td>
                            <td>3.3</td>
                        </tr>
                    </tbody>
                </table>
            </table-wrap>
            <fig id="figure2" position="float">
                <label>Figure 2</label>
                <caption>
                    <p>Distribution of disease events in our corpus by country (only countries with 2 or more events shown; map produced by GPS Visualizer)</p>
                </caption>
                <!--Original graphic name: http://writer.zoho.com:80/image.do?imgurl=6d37aff7952c652b15981bdb67ae7d24753f78e991a900217e61b98366e8ce2e8fcdb9a7113f80af5ecc251eae746385-->
                <graphic xlink:href="jmir_v12i3e43_fig2.gif" alt-version="no" mimetype="image" position="float" xlink:type="simple" />
            </fig>
            <p>While we hope that the event frame may form part of the foundation for a future standard, we recognize that there are challenges in achieving this goal (see <xref ref-type="boxed-text" rid="box3">Textbox 3</xref>). Further, due to copyright restrictions, we are unable to distribute the corpus directly. Instead, we have provided two methods for corpus access: a download script (a Perl script that downloads and cleans all the documents from their original source on the Web and then associates them with event frames) and a graphical user interface (GUI)-based event browser (see <xref ref-type="fig" rid="figure3">Figure 3</xref>) [<xref ref-type="bibr" rid="ref9">9</xref>]. Note that as of July 2009, only 176 of the original 200 documents were still available online.</p>
            <boxed-text id="box3" position="float">
                <title>Barriers to general adoption of the event annotation scheme</title>
                <p>
                    <bold>Heterogeneous Systems and Requirements</bold>
                </p>
                <p>The current event frames may not be suitable for all needs. For some users, the knowledge required by event frames may be superfluous (eg, a system that is solely concerned with identifying cholera outbreaks has no need for zoonosis information). For other users, the event frame may not encode enough information (eg, an event's certainty or uncertainty&#8212;unrepresented in our event frame&#8212;may be important for system designers). Indeed, it is conceivable that some users may suffer from both these problems. Nevertheless, we believe that our event scheme provides a foundation for potential future standards development.</p>
                <p>
                    <bold>Agreement</bold>
                </p>
                <p>The current agreement level for number of events (67%) is not high. However, this result masks the fact that agreement for important entity properties such as HAS_LOCATION.COUNTRY and HAS_DISEASE is almost perfect.</p>
                <p>
                    <bold>Extending to New Genres</bold>
                </p>
                <p>The current scheme was designed for news text. It is not clear how well the scheme would extend to other, less formal genres that may contain information of interest (eg, blog postings and message boards).</p>
            </boxed-text>
            <fig id="figure3" position="float">
                <label>Figure 3</label>
                <caption>
                    <p>Linux BioCaster corpus event frame browsing tool [<xref ref-type="bibr" rid="ref9">9</xref>]</p>
                </caption>
                <!--Original graphic name: http://writer.zoho.com:80/image.do?imgurl=6d37aff7952c652b15981bdb67ae7d24753f78e991a900217e61b98366e8ce2e699ba71585e4179c8ea67e8052098e1a-->
                <graphic xlink:href="jmir_v12i3e43_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple" />
            </fig>
            <p>In summary, we present an annotation scheme and corpus that can be used in the evaluation of disease outbreak event extraction algorithms. The annotation scheme and corpus are presented to the research community in the belief that such resources can help in the formation of an emerging standard for this rapidly growing research area.</p>
        </sec>
    </body>
    <back>
        <ack>
            <p>This work was partially funded by a Japan Society for the Promotion of Science postdoctoral fellowship (author MC).</p>
        </ack>
        <fn-group>
            <fn fn-type="conflict">
                <p>None declared</p>
            </fn>
        </fn-group>
        <ref-list>
            <ref id="ref1">
                <label>1</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Eysenbach</surname>
                            <given-names>G</given-names>
                        </name>
                    </person-group>
                    <article-title>Infodemiology and infoveillance: framework for an emerging set of public health informatics methods to analyze search, communication and publication behavior on the Internet</article-title>
                    <source>J Med Internet Res</source>
                    <year>2009</year>
                    <volume>11</volume>
                    <issue>1</issue>
                    <fpage>e11</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2009/1/e11/" />
                    </comment>
                    <pub-id pub-id-type="doi">10.2196/jmir.1157</pub-id>
                    <pub-id pub-id-type="medline">19329408</pub-id>
                    <pub-id pub-id-type="pii">v11i1e11</pub-id>
                    <pub-id pub-id-type="pmcid">PMC2762766</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref2">
                <label>2</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Keller</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Blench</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Tolentino</surname>
                            <given-names>H</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Freifeld</surname>
                            <given-names>CC</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Mandl</surname>
                            <given-names>KD</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Mawudeku</surname>
                            <given-names>A</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Eysenbach</surname>
                            <given-names>G</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Brownstein</surname>
                            <given-names>JS</given-names>
                        </name>
                    </person-group>
                    <article-title>Use of unstructured event-based reports for global infectious disease surveillance</article-title>
                    <source>Emerg Infect Dis</source>
                    <year>2009</year>
                    <month>05</month>
                    <volume>15</volume>
                    <issue>5</issue>
                    <fpage>689</fpage>
                    <lpage>95</lpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.cdc.gov/eid/content/15/5/689.htm" />
                    </comment>
                    <pub-id pub-id-type="medline">19402953</pub-id>
                    <pub-id pub-id-type="pmcid">PMC2687026</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref3">
                <label>3</label>
                <nlm-citation citation-type="web">
                    <collab>Public Health Agency of Canada</collab>
                    <source>Public Health Agency of Canada News Releases</source>
                    <year>2004</year>
                    <access-date>2010-08-17</access-date>
                    <comment>The Global Public Health Intelligence Network (GPHIN). <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.phac-aspc.gc.ca/media/nr-rp/2004/2004_gphin-rmispbk-eng.php">http://www.phac-aspc.gc.ca/media/nr-rp/2004/2004_gphin-rmispbk-eng.php</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5s2xP05dn</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref4">
                <label>4</label>
                <nlm-citation citation-type="web">
                    <source>HealthMap</source>
                    <access-date>2010-09-03</access-date>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.healthmap.org/en/">http://www.healthmap.org/en/</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5sTEpwxJV</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref5">
                <label>5</label>
                <nlm-citation citation-type="web">
                    <source>BioCaster</source>
                    <access-date>2010-09-03</access-date>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://biocaster.nii.ac.jp/">http://biocaster.nii.ac.jp/</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5sTEunLHH</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref6">
                <label>6</label>
                <nlm-citation citation-type="web">
                    <source>MedISys</source>
                    <access-date>2010-09-03</access-date>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://medusa.jrc.it/medisys/homeedition/all/home.html">http://medusa.jrc.it/medisys/homeedition/all/home.html</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5sTF1s0bJ</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref7">
                <label>7</label>
                <nlm-citation citation-type="web">
                    <source>Pattern-based Understanding and Learning System (PULS)</source>
                    <access-date>2010-09-03</access-date>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://puls.cs.helsinki.fi/medical/">http://puls.cs.helsinki.fi/medical/</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5sTFDvYt2</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref8">
                <label>8</label>
                <nlm-citation citation-type="web">
                    <source>EpiSPIDER</source>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.epispider.org/">http://www.epispider.org/</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5snMyW0n4</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref9">
                <label>9</label>
                <nlm-citation citation-type="web">
                    <source>BioCaster event corpus and tools</source>
                    <access-date>2010-09-03</access-date>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://code.google.com/p/becorpus/">http://code.google.com/p/becorpus/</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5sTMiCoF7</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref10">
                <label>10</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Collier</surname>
                            <given-names>N</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Doan</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Kawazoe</surname>
                            <given-names>A</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Matsuda-Goodwin</surname>
                            <given-names>R</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Conway</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Tateno</surname>
                            <given-names>Y</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Ngo</surname>
                            <given-names>QH</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Dien</surname>
                            <given-names>D</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Kawtrakul</surname>
                            <given-names>A</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Takeuchi</surname>
                            <given-names>K</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Shigematsu</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Taniguchi</surname>
                            <given-names>K</given-names>
                        </name>
                    </person-group>
                    <article-title>BioCaster: detecting public health rumors with a Web-based text mining system</article-title>
                    <source>Bioinformatics</source>
                    <year>2008</year>
                    <month>12</month>
                    <day>15</day>
                    <volume>24</volume>
                    <issue>24</issue>
                    <fpage>2940</fpage>
                    <lpage>1</lpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://bioinformatics.oxfordjournals.org/cgi/pmidlookup?view=long&#38;pmid=18922806" />
                    </comment>
                    <pub-id pub-id-type="doi">10.1093/bioinformatics/btn534</pub-id>
                    <pub-id pub-id-type="medline">18922806</pub-id>
                    <pub-id pub-id-type="pii">btn534</pub-id>
                    <pub-id pub-id-type="pmcid">PMC2639299</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref11">
                <label>11</label>
                <nlm-citation citation-type="book">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Steinberger</surname>
                            <given-names>R</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Fuart</surname>
                            <given-names>F</given-names>
                        </name>
                        <name name-style="western">
                            <surname>van der Goot</surname>
                            <given-names>E</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Best</surname>
                            <given-names>C</given-names>
                        </name>
                        <name name-style="western">
                            <surname>von Etter</surname>
                            <given-names>P</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Yangarber</surname>
                            <given-names>R</given-names>
                        </name>
                    </person-group>
                    <article-title>Text mining from the web for medical intelligence</article-title>
                    <source>Fogelman-Souli&#233; F, Perrotta D, Piskorski J, Steinberger R. editors. Mining Massive Data Sets for Security</source>
                    <year>2008</year>
                    <publisher-loc>Amsterdam, Netherlands</publisher-loc>
                    <publisher-name>IOS Press</publisher-name>
                    <fpage>295</fpage>
                    <lpage>310</lpage>
                </nlm-citation>
            </ref>
            <ref id="ref12">
                <label>12</label>
                <nlm-citation citation-type="web">
                    <source>TREC Genomics Track</source>
                    <access-date>2010-09-03</access-date>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://ir.ohsu.edu/genomics/">http://ir.ohsu.edu/genomics/</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5sTMt2HZa</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref13">
                <label>13</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Hirschman</surname>
                            <given-names>L</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Park</surname>
                            <given-names>JC</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Tsujii</surname>
                            <given-names>J</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Wong</surname>
                            <given-names>L</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Wu</surname>
                            <given-names>CH</given-names>
                        </name>
                    </person-group>
                    <article-title>Accomplishments and challenges in literature data mining for biology</article-title>
                    <source>Bioinformatics</source>
                    <year>2002</year>
                    <month>12</month>
                    <volume>18</volume>
                    <issue>12</issue>
                    <fpage>1553</fpage>
                    <lpage>61</lpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://bioinformatics.oxfordjournals.org/cgi/pmidlookup?view=long&#38;pmid=12490438" />
                    </comment>
                    <pub-id pub-id-type="medline">12490438</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref14">
                <label>14</label>
                <nlm-citation citation-type="confproc">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Blench</surname>
                            <given-names>M</given-names>
                        </name>
                    </person-group>
                    <article-title>Global public health intelligence network (GPHIN)</article-title>
                    <conf-name>The eighth conference of the association for machine translation in the Americas</conf-name>
                    <conf-date>2008 Oct 21-25</conf-date>
                    <conf-loc>Waikiki, Hawaii</conf-loc>
                    <comment>
                        <ext-link xlink:type="simple" xlink:href="http://www.amtaweb.org/papers/4.02_Blench2008.pdf" />
                    </comment>
                </nlm-citation>
            </ref>
            <ref id="ref15">
                <label>15</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Freifeld</surname>
                            <given-names>CC</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Mandl</surname>
                            <given-names>KD</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Reis</surname>
                            <given-names>BY</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Brownstein</surname>
                            <given-names>JS</given-names>
                        </name>
                    </person-group>
                    <article-title>HealthMap: global infectious disease monitoring through automated classification and visualization of Internet media reports</article-title>
                    <source>J Am Med Inform Assoc</source>
                    <year>2008</year>
                    <month>04</month>
                    <volume>15</volume>
                    <issue>2</issue>
                    <fpage>150</fpage>
                    <lpage>7</lpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.ncbi.nlm.nih.gov/pmc/articles/pmid/18096908/?tool=pubmed" />
                    </comment>
                    <pub-id pub-id-type="doi">10.1197/jamia.M2544</pub-id>
                    <pub-id pub-id-type="medline">18096908</pub-id>
                    <pub-id pub-id-type="pii">M2544</pub-id>
                    <pub-id pub-id-type="pmcid">PMC2274789</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref16">
                <label>16</label>
                <nlm-citation citation-type="confproc">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Kawazoe</surname>
                            <given-names>A</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Jin</surname>
                            <given-names>L</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Shigematsu</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Barrero</surname>
                            <given-names>R</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Taniguchi</surname>
                            <given-names>K</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Collier</surname>
                            <given-names>N</given-names>
                        </name>
                    </person-group>
                    <article-title>The development of a schema for the annotation of terms in the BioCaster disease detecting/tracking system</article-title>
                    <conf-name>KR-MED 2006: Biomedical ontology in action</conf-name>
                    <conf-date>2006 Nov 8</conf-date>
                    <conf-loc>Baltimore, Maryland</conf-loc>
                    <comment>
                        <ext-link xlink:type="simple" xlink:href="http://sunsite.informatik.rwth-aachen.de/Publications/CEUR-WS/Vol-222/krmed2006-p09.pdf" />
                    </comment>
                </nlm-citation>
            </ref>
            <ref id="ref17">
                <label>17</label>
                <nlm-citation citation-type="web">
                    <person-group person-group-type="author">
                        <collab>World Health Organization</collab>
                    </person-group>
                    <source>International Health Regulations. 2nd edition</source>
                    <year>2005</year>
                    <access-date>2010-09-03</access-date>
                    <publisher-loc>Geneva, Switzerland</publisher-loc>
                    <publisher-name>World Health Organization</publisher-name>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://whqlibdoc.who.int/publications/2008/9789241580410_eng.pdf">http://whqlibdoc.who.int/publications/2008/9789241580410_eng.pdf</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5sTNW7ABt</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref18">
                <label>18</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Wilbur</surname>
                            <given-names>WJ</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Rzhetsky</surname>
                            <given-names>A</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Shatkay</surname>
                            <given-names>H</given-names>
                        </name>
                    </person-group>
                    <article-title>New directions in biomedical text annotation: definitions, guidelines and corpus construction</article-title>
                    <source>BMC Bioinformatics</source>
                    <year>2006</year>
                    <volume>7</volume>
                    <fpage>356</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.biomedcentral.com/1471-2105/7/356" />
                    </comment>
                    <pub-id pub-id-type="doi">10.1186/1471-2105-7-356</pub-id>
                    <pub-id pub-id-type="medline">16867190</pub-id>
                    <pub-id pub-id-type="pii">1471-2105-7-356</pub-id>
                    <pub-id pub-id-type="pmcid">PMC1559725</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref19">
                <label>19</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Carletta</surname>
                            <given-names>J</given-names>
                        </name>
                    </person-group>
                    <article-title>Assessing agreement on classification tasks: The kappa statistic</article-title>
                    <source>Computational Linguistics</source>
                    <year>1996</year>
                    <volume>22</volume>
                    <issue>2</issue>
                    <fpage>249</fpage>
                    <lpage>254</lpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.aclweb.org/anthology-new/J/J96/J96-2004.pdf" />
                    </comment>
                    <pub-id pub-id-type="other">5soWBbLfV</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref20">
                <label>20</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Pyysalo</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Ginter</surname>
                            <given-names>F</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Heimonen</surname>
                            <given-names>J</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Björne</surname>
                            <given-names>J</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Boberg</surname>
                            <given-names>J</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Järvinen</surname>
                            <given-names>J</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Salakoski</surname>
                            <given-names>T</given-names>
                        </name>
                    </person-group>
                    <article-title>BioInfer: a corpus for information extraction in the biomedical domain</article-title>
                    <source>BMC Bioinformatics</source>
                    <year>2007</year>
                    <volume>8</volume>
                    <fpage>50</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.biomedcentral.com/1471-2105/8/50" />
                    </comment>
                    <pub-id pub-id-type="doi">10.1186/1471-2105-8-50</pub-id>
                    <pub-id pub-id-type="medline">17291334</pub-id>
                    <pub-id pub-id-type="pii">1471-2105-8-50</pub-id>
                    <pub-id pub-id-type="pmcid">PMC1808065</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref21">
                <label>21</label>
                <nlm-citation citation-type="web">
                    <source>ProMED-mail</source>
                    <access-date>2010-08-17</access-date>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.promedmail.org/pls/apex/f?p=2400:1000">http://www.promedmail.org/pls/apex/f?p=2400:1000</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5s2x3UEO8</pub-id>
                </nlm-citation>
            </ref>
        </ref-list>
        <glossary>
            <title>Abbreviations</title>
            <def-list>
                <def-item>
                    <term id="abb2">GPHIN</term>
                    <def>
                        <p>Global Public Health Intelligence Network</p>
                    </def>
                </def-item>
                <def-item>
                    <term id="abb3">GUI</term>
                    <def>
                        <p>graphical user interface</p>
                    </def>
                </def-item>
                <def-item>
                    <term id="abb4">IE</term>
                    <def>
                        <p>information extraction</p>
                    </def>
                </def-item>
                <def-item>
                    <term id="abb5">IR</term>
                    <def>
                        <p>information retrieval</p>
                    </def>
                </def-item>
                <def-item>
                    <term id="abb6">MUC</term>
                    <def>
                        <p>Message Understanding Conference</p>
                    </def>
                </def-item>
                <def-item>
                    <term id="abb7">NER</term>
                    <def>
                        <p>named entity recognition</p>
                    </def>
                </def-item>
                <def-item>
                    <term id="abb8">PULS</term>
                    <def>
                        <p>Pattern-based Understanding and Learning System</p>
                    </def>
                </def-item>
                <def-item>
                    <term id="abb9">TREC</term>
                    <def>
                        <p>Text Retrieval Conference</p>
                    </def>
                </def-item>
                <def-item>
                    <term id="abb10">WHO</term>
                    <def>
                        <p>World Health Organization</p>
                    </def>
                </def-item>
                <def-item>
                    <term id="abb11">XML</term>
                    <def>
                        <p>Extensible Markup Language</p>
                    </def>
                </def-item>
            </def-list>
        </glossary>
    </back>
</article>
