<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
    <article-id pub-id-type="publisher-id">v21i1e10013</article-id>
    <article-id pub-id-type="pmid">30622098</article-id>
    <article-id pub-id-type="doi">10.2196/10013</article-id>
    <article-categories>
      <subj-group subj-group-type="heading">
        <subject>Original Paper</subject>
      </subj-group>
      <subj-group subj-group-type="article-type">
        <subject>Original Paper</subject>
      </subj-group>
    </article-categories>
    <title-group>
      <article-title>Application of Efficient Data Cleaning Using Text Clustering for Semistructured Medical Reports to Large-Scale Stool Examination Reports: Methodology Study</article-title>
    </title-group>
    <contrib-group>
      <contrib contrib-type="editor">
        <name>
          <surname>Eysenbach</surname>
          <given-names>Gunther</given-names>
        </name>
      </contrib>
    </contrib-group>
    <contrib-group>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Raghupathi</surname>
          <given-names>Wullianallur</given-names>
        </name>
      </contrib>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Barteit</surname>
          <given-names>Sandra</given-names>
        </name>
      </contrib>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Shen</surname>
          <given-names>Feichen</given-names>
        </name>
      </contrib>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Wang</surname>
          <given-names>Fusheng</given-names>
        </name>
      </contrib>
    </contrib-group>
    <contrib-group>
      <contrib contrib-type="author" id="contrib1" equal-contrib="yes">
        <name name-style="western">
          <surname>Woo</surname>
          <given-names>Hyunki</given-names>
        </name>
        <degrees>BS</degrees>
        <xref rid="aff1" ref-type="aff">1</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-4868-6270</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib2" equal-contrib="yes">
        <name name-style="western">
          <surname>Kim</surname>
          <given-names>Kyunga</given-names>
        </name>
        <degrees>PhD</degrees>
        <xref rid="aff1" ref-type="aff">1</xref>
        <xref rid="aff2" ref-type="aff">2</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-0865-2236</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib3">
        <name name-style="western">
          <surname>Cha</surname>
          <given-names>KyeongMin</given-names>
        </name>
        <degrees>MS</degrees>
        <xref rid="aff1" ref-type="aff">1</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0001-9868-9253</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib4">
        <name name-style="western">
          <surname>Lee</surname>
          <given-names>Jin-Young</given-names>
        </name>
        <degrees>MD, PhD</degrees>
        <xref rid="aff3" ref-type="aff">3</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-7420-3287</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib5">
        <name name-style="western">
          <surname>Mun</surname>
          <given-names>Hansong</given-names>
        </name>
        <degrees>MD, PhD</degrees>
        <xref rid="aff3" ref-type="aff">3</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0003-4718-1788</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib6">
        <name name-style="western">
          <surname>Cho</surname>
          <given-names>Soo Jin</given-names>
        </name>
        <degrees>MD</degrees>
        <xref rid="aff3" ref-type="aff">3</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0003-4361-4988</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib7">
        <name name-style="western">
          <surname>Chung</surname>
          <given-names>Ji In</given-names>
        </name>
        <degrees>MD, PhD</degrees>
        <xref rid="aff3" ref-type="aff">3</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0003-0894-3575</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib8">
        <name name-style="western">
          <surname>Pyo</surname>
          <given-names>Jeung Hui</given-names>
        </name>
        <degrees>MD</degrees>
        <xref rid="aff3" ref-type="aff">3</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-9992-9367</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib9">
        <name name-style="western">
          <surname>Lee</surname>
          <given-names>Kun-Chul</given-names>
        </name>
        <degrees>PhD</degrees>
        <xref rid="aff4" ref-type="aff">4</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-7636-8114</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib10" corresp="yes">
      <name name-style="western">
        <surname>Kang</surname>
        <given-names>Mira</given-names>
      </name>
      <degrees>MD, PhD</degrees>
      <xref rid="aff1" ref-type="aff">1</xref>
      <xref rid="aff3" ref-type="aff">3</xref>
      <address>
        <institution>Center for Health Promotion</institution>
        <institution>Samsung Medical Center</institution>
        <institution>Sungkyunkwan University School of Medicine</institution>
        <addr-line>81 Irwon-ro, Gangnam-gu</addr-line>
        <addr-line>Seoul, 06351</addr-line>
        <country>Republic of Korea</country>
        <phone>82 2 3410 3882</phone>
        <fax>82 2 3410 0054</fax>
        <email>mira90.kang@samsung.com</email>
      </address>  
      <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-7842-0035</ext-link></contrib>
    </contrib-group>
    <aff id="aff1">
    <label>1</label>
    <institution>Department of Digital Health</institution>
    <institution>Samsung Advanced Institute for Health Sciences &#38; Technology</institution>  
    <institution>Sungkyunkwan University</institution>  
    <addr-line>Seoul</addr-line>
    <country>Republic of Korea</country></aff>
    <aff id="aff2">
    <label>2</label>
    <institution>Statistics and Data Center</institution>
    <institution>Research Institute for Future Medicine</institution>  
    <institution>Samsung Medical Center</institution>  
    <addr-line>Seoul</addr-line>
    <country>Republic of Korea</country></aff>
    <aff id="aff3">
    <label>3</label>
    <institution>Center for Health Promotion</institution>
    <institution>Samsung Medical Center</institution>  
    <institution>Sungkyunkwan University School of Medicine</institution>  
    <addr-line>Seoul</addr-line>
    <country>Republic of Korea</country></aff>
    <aff id="aff4">
      <label>4</label>
      <institution>Jason TG</institution>
      <addr-line>Seoul</addr-line>
      <country>Republic of Korea</country>
    </aff>
    <author-notes>
      <corresp>Corresponding Author: Mira Kang 
      <email>mira90.kang@samsung.com</email></corresp>
    </author-notes>
    <pub-date pub-type="collection"><month>01</month><year>2019</year></pub-date>
    <pub-date pub-type="epub">
      <day>08</day>
      <month>01</month>
      <year>2019</year>
    </pub-date>
    <volume>21</volume>
    <issue>1</issue>
    <elocation-id>e10013</elocation-id>
    <!--history from ojs - api-xml-->
    <history>
      <date date-type="received">
        <day>6</day>
        <month>2</month>
        <year>2018</year>
      </date>
      <date date-type="rev-request">
        <day>30</day>
        <month>7</month>
        <year>2018</year>
      </date>
      <date date-type="rev-recd">
        <day>23</day>
        <month>9</month>
        <year>2018</year>
      </date>
      <date date-type="accepted">
        <day>12</day>
        <month>10</month>
        <year>2018</year>
      </date>
    </history>
    <!--(c) the authors - correct author names and publication date here if necessary. Date in form ', dd.mm.yyyy' after jmir.org-->
    <copyright-statement>©Hyunki Woo, Kyunga Kim, KyeongMin Cha, Jin-Young Lee, Hansong Mun, Soo Jin Cho, Ji In Chung, Jeung Hui Pyo, Kun-Chul Lee, Mira Kang. Originally published in the Journal of Medical Internet Research (http://www.jmir.org), 08.01.2019.</copyright-statement>
    <copyright-year>2019</copyright-year>
    <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
      <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on http://www.jmir.org/, as well as this copyright and license information must be included.</p>
    </license>  
    <self-uri xlink:href="https://www.jmir.org/2019/1/e10013/" xlink:type="simple"/>
    <abstract>
      <sec sec-type="background">
        <title>Background</title>
        <p>Since medical research based on big data has become more common, the community’s interest and effort to analyze a large amount of semistructured or unstructured text data, such as examination reports, have rapidly increased. However, these large-scale text data are often not readily applicable to analysis owing to typographical errors, inconsistencies, or data entry problems. Therefore, an efficient data cleaning process is required to ensure the veracity of such data.</p>
      </sec>
      <sec sec-type="objective">
        <title>Objective</title>
        <p>In this paper, we proposed an efficient data cleaning process for large-scale medical text data, which employs text clustering methods and a value-converting technique, and evaluated its performance with medical examination text data.</p>
      </sec>
      <sec sec-type="methods">
        <title>Methods</title>
        <p>The proposed data cleaning process consists of text clustering and value-converting. In the text clustering step, we suggested the use of key collision and nearest neighbor methods in a complementary manner. Words (called values) in the same cluster would be expected as a correct value and its wrong representations. In the value-converting step, wrong values for each identified cluster would be converted into their correct value. We applied this data cleaning process to 574,266 stool examination reports produced for parasite analysis at Samsung Medical Center from 1995 to 2015. The performance of the proposed process was examined and compared with data cleaning processes based on a single clustering method. We used OpenRefine 2.7, an open source application that provides various text clustering methods and an efficient user interface for value-converting with common-value suggestion.</p>
      </sec>
      <sec sec-type="results">
        <title>Results</title>
        <p>A total of 1,167,104 words in stool examination reports were surveyed. In the data cleaning process, we discovered 30 correct words and 45 patterns of typographical errors and duplicates. We observed high correction rates for words with typographical errors (98.61%) and typographical error patterns (97.78%). The resulting data accuracy was nearly 100% based on the number of total words.</p>
      </sec>
      <sec sec-type="conclusions">
        <title>Conclusions</title>
        <p>Our data cleaning process based on the combinatorial use of key collision and nearest neighbor methods provides an efficient cleaning of large-scale text data and hence improves data accuracy.</p>
      </sec>
    </abstract>
    <kwd-group>
      <kwd>data cleaning</kwd>
      <kwd>text clustering</kwd>
      <kwd>key collision</kwd>
      <kwd>nearest neighbor methods</kwd>
      <kwd>OpenRefine</kwd>
    </kwd-group></article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>In all of the industries, including the medical field, complex and diverse (structured, semistructured, unstructured) data have been growing dramatically for decades [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. Although most health data have been digitalized, it is still not easy to handle medical records such as examination reports or physician’s notes because they are historically based on paper records and generated data mainly in semistructured or unstructured forms. In addition, they may contain a variety of nonidentical duplicates, typographical errors, inconsistencies, and data entry problems [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref7">7</xref>].</p>
      <p>High performance analysis requires clean and high-quality data to yield reliable results [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref11">11</xref>]. Therefore, efficient data cleaning takes precedence to improve the quality of data and obtain accurate analysis results [<xref ref-type="bibr" rid="ref12">12</xref>]. However, researchers are commonly faced with many obstacles in transforming the data into a clean and high-quality dataset owing to diverse patterns of typographical errors and duplicates.</p>
      <p>For text analysis of semistructured or unstructured data, we can use a paid program such as SAS Content Categorization (SAS Institute Inc) or IBM Watson Content Analytics (IBM) [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. However, these programs are very expensive and are not readily available to individual researchers because they are mainly sold to companies or research groups. Also, these programs require extensive practice and experience.</p>
      <p>Data cleaning using Excel’s “remove duplicates” function has been done before, but it is mostly impractical to clean the data using Excel tools. Some of the nonidentical duplicates still remain because they are not recognized as duplicates when special characters or punctuation marks appear [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. Duplicate detection tools such as the Febrl system, TAILOR, and BigMatch were also used in cleaning data. However, Febrl has usability limitations such as slowness, unclear error messages, and complicated installations [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref20">20</xref>]. The listed programs are rather complex for average users who do not have experience with programming and language functions.</p>
      <p>Many researchers who interpret and clean the local datasets are domain experts and are not familiar with the programming language [<xref ref-type="bibr" rid="ref21">21</xref>]. Thus, researchers need user friendly cleaning tools. OpenRefine can identify all types of strings and remove duplicates without the difficulties of programming and is a free, open source tool. OpenRefine contains the following 2 clustering methods: key collision methods and nearest neighbor methods. We proposed a data cleaning process using both text clustering methods in OpenRefine to improve accuracy of semistructured data.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <p>We performed data cleaning of 574,266 stool examination reports conducted at Samsung Medical Center from 1995 to 2015. Data for this study were extracted from DARWIN-C, the clinical data warehouse of Samsung Medical Center. According to the data cleaning process proposed in <xref ref-type="fig" rid="figure1">Figure 1</xref>, we conducted data cleaning by clustering and merging parasite names and investigated its performance.</p>
      <p>As described in <xref ref-type="fig" rid="figure1">Figure 1</xref>, the proposed data cleaning process consists of the following 4 steps: preprocess, text facet, systematic cleaning, and manual cleaning. In the preprocess, only names related to parasites (ie, helminth or protozoa) in raw text data were extracted using the regular expression functions of STATA MP 14.2 version [<xref ref-type="bibr" rid="ref22">22</xref>]. The extracted words were then uploaded on OpenRefine 2.7. In the text facet step, the number of occurrences was browsed for each word.</p>
      <p>The systematic cleaning step consists of text clustering and value-merging. Two clustering methods (ie, key collision and nearest neighbor) are used in a complementary manner to identify word clusters, each of which is expected to contain a correct word and its wrong representations with diverse forms of typographical errors (called “wrong values”). Key collision methods work by creating an alternate representation of a key that contains only the most significant or meaningful parts of a string and by clustering different strings together based on the same key. Because key collision methods are fast and simple in a variety of contexts, they have been often used for text clustering. We sequentially used 4 key collision methods including fingerprint, N-gram fingerprint, Metaphone3, and Cologne phonetic in OpenRefine. Nearest neighbor methods (also known as kNN) are widely used for clustering as well. These methods are slower but more accurate because they calculate the distance between each value. We sequentially used two nearest neighbor methods, the Levenshtein distance method and Prediction by Partial Matching method in OpenRefine. We combined both methods to enhance the accuracy [<xref ref-type="bibr" rid="ref23">23</xref>].</p>
      <p>For each identified cluster, the wrong values are converted to their correct word by value-merging. Because OpenRefine provides a convenient user interface that lists the correct word and its wrong values in each cluster in descending order of occurrence frequency, researchers can easily recognize the correct word and conduct the value-merging task. For “<italic>Clonorchis sinensis</italic>” in stool examination report data, a variety of wrong expressions were noticed in the same cluster, such as clonorchis sinesis, clnorchis sinensis, clonorchis cinensis, clonrchis sinensis, and clornorchis sinensis (<xref ref-type="fig" rid="figure2">Figure 2</xref>). By looking at the word list, we were able to efficiently choose “<italic>Clonorchis sinensis</italic>” as the correct word and make a quick decision to convert all the others to “<italic>Clonorchis sinensis</italic>.” In the final step, the remaining words that did not belong to any cluster were investigated and manually cleaned when necessary.</p>
      <fig id="figure1" position="float">
        <label>Figure 1</label>
        <caption>
          <p>Flow chart of our data cleaning process. PPM: prediction by partial matching.</p>
        </caption>
        <graphic xlink:href="jmir_v21i1e10013_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
      <fig id="figure2" position="float">
        <label>Figure 2</label>
        <caption>
          <p>Representative screenshot of OpenRefine interface used for value-merging task.</p>
        </caption>
        <graphic xlink:href="jmir_v21i1e10013_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>A total of 1,167,104 words in 574,266 stool examination reports were surveyed, and words not related to the names of helminth or protozoa were excluded from the study. We discovered 30 correct words and 45 patterns of typographical errors and duplicates (<xref ref-type="app" rid="app1">Multimedia Appendix 1</xref>). The key collision methods were able to cluster the patterns of typographical errors and duplicates with the correct word except for 6 patterns. The nearest neighbor methods were able to cluster the patterns of typographical errors and duplicates with the correct word except for 2 patterns (<xref ref-type="table" rid="table1">Table 1</xref>).</p>
      <table-wrap position="float" id="table1">
        <label>Table 1</label>
        <caption>
          <p>List of typographical errors that could not be clustered with the correct word by each method.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="550"/>
          <col width="150"/>
          <col width="150"/>
          <col width="150"/>
          <thead>
            <tr valign="top">
              <td>Correct word</td>
              <td>Typographical error</td>
              <td>Key collision</td>
              <td>Nearest neighbor</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td rowspan="2">Negative</td>
              <td><list list-type="bullet"><list-item><p>Native</p></list-item></list></td>
              <td>✗<sup>a</sup></td>
              <td>✗</td>
            </tr>
            <tr valign="top">
              <td><list list-type="bullet"><list-item><p>Negaitve</p></list-item></list></td>
              <td>✓<sup>b</sup></td>
              <td>✗</td>
            </tr>
            <tr valign="top">
              <td rowspan="2">Endolimax</td>
              <td><list list-type="bullet"><list-item><p>Eolimax</p></list-item></list></td>
              <td>✗</td>
              <td>✓</td>
            </tr>
            <tr valign="top">
              <td><list list-type="bullet"><list-item><p>Endolix</p></list-item></list></td>
              <td>✗</td>
              <td>✓</td>
            </tr>
            <tr valign="top">
              <td>Entamoeba</td>
              <td><list list-type="bullet"><list-item><p>Etamoeba</p></list-item></list></td>
              <td>✗</td>
              <td>✓</td>
            </tr>
            <tr valign="top">
              <td rowspan="2">Lamblia</td>
              <td><list list-type="bullet"><list-item><p>Lamdlia</p></list-item></list></td>
              <td>✗</td>
              <td>✓</td>
            </tr>
            <tr valign="top">
              <td><list list-type="bullet"><list-item><p>G.lamblia</p></list-item></list></td>
              <td>✗</td>
              <td>✓</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table1fn1">
            <p><sup>a</sup>✗: Typographical error is not clustered with correct word.</p>
          </fn>
          <fn id="table1fn2">
            <p><sup>b</sup>✓: Typographical error is clustered with correct word.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <table-wrap position="float" id="table2">
        <label>Table 2</label>
        <caption>
          <p>Correction rates by each method.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="540"/>
          <col width="230"/>
          <col width="230"/>
          <thead>
            <tr valign="top">
              <td>Method</td>
              <td>Correction rate by the number of typographical error patterns<sup>a</sup>, %</td>
              <td>Correction rate by the number of typographical error words<sup>b</sup>, %</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Key collision</td>
              <td>86.67</td>
              <td>91.67</td>
            </tr>
            <tr valign="top">
              <td>Nearest neighbor</td>
              <td>95.56</td>
              <td>97.22</td>
            </tr>
            <tr valign="top">
              <td>Using both</td>
              <td>97.78</td>
              <td>98.61</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table2fn1">
            <p><sup>a</sup>The number of corrected typographical error patterns divided by the total number of typographical error patterns multiplied by 100 (%).</p>
          </fn>
          <fn id="table2fn2">
            <p><sup>b</sup>The number of corrected typographical error words divided by the total number of typographical error words multiplied by 100 (%).</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <p>The word “native” was the only pattern not clustered as “negative” out of all typographical errors by any clustering method because of the high inconsistency rate of the 2 words (2/6 characters, 33%). All typographical errors and duplicates except “native” were clustered correctly. We achieved a high correction rate of 98.61% by the number of typographical error words and 97.78% by the number of typographical error patterns when using both clustering methods (<xref ref-type="table" rid="table2">Table 2</xref>). After systematic data cleaning of 1,167,104 words, only 1 word with a typographical error remained and was revised manually. Thus, the accuracy of systematic data cleaning was nearly 100% based on the number of total words.</p>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <p>Many researchers have made great efforts to study data analytics methodology, but there have been relatively few studies on data cleaning methodology for unexpected typographical errors [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. It is rare to find a report that quantitatively analyzes the performance of data cleaning methods because they are often undocumented and used in nonofficial ways [<xref ref-type="bibr" rid="ref24">24</xref>]. In this study, we suggested an efficient way of data cleaning for large-scale medical text data and investigated its cleaning performance. Although several methods of text analysis exist, it is not easy for general researchers to use these methods. Most methods are not readily available or have limitations in usability. Therefore, there is a need for more feasible and user friendly methods for cleaning large-scale text datasets.</p>
      <p>We employed OpenRefine for data cleaning because of the following advantages. First, individual researchers can easily access and use OpenRefine because it is a free and open source tool. Second, OpenRefine provides researchers with an easy interface to clean the data without difficulties of programming. Third, one can easily fix rare typographical errors (which are not automatically corrected) manually and have the opportunity to modify false positive clustering [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref23">23</xref>].</p>
      <p>However, we still need much effort to review each clustering result and decide whether to merge, especially in cases where the number of clusters is extremely large. In addition, formal technical support for OpenRefine is not available, and it is supported by user forums or communities. Despite these limitations, OpenRefine is a useful and effective support tool for labor-intensive and time-consuming data cleaning of semistructured data.</p>
      <p>Our data cleaning process can be applied to other types of semistructured text data because we observed that the combinatorial use of key collision and nearest neighbor methods resulted in efficient and reliable data cleaning.</p>
    </sec>
  </body>
  <back>
    <app-group>
      <app id="app1">
        <title>Multimedia Appendix 1</title>
        <p>Patterns of parasite names in stool examination reports.</p>
        <media xlink:href="jmir_v21i1e10013_app1.pdf" xlink:title="PDF File (Adobe PDF File), 39KB"/>
      </app>
    </app-group>
    <ack>
      <p>This study was supported by Samsung Medical Center grant (SMX1170601).</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Qiu</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Tsai</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Hassan</surname>
            <given-names>MM</given-names>
          </name>
          <name name-style="western">
            <surname>Alamri</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <article-title>Health-CPS: Healthcare Cyber-Physical System Assisted by Cloud and Big Data</article-title>
        <source>IEEE Systems Journal</source>  
        <year>2017</year>  
        <month>3</month>  
        <volume>11</volume>  
        <issue>1</issue>  
        <fpage>88</fpage>  
        <lpage>95</lpage>  
        <pub-id pub-id-type="doi">10.1109/Jsyst.2015.2460747</pub-id></nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Das</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Kumar</surname>
            <given-names>P</given-names>
          </name>
        </person-group>
        <article-title>Big data analytics: A framework for unstructured data analysis</article-title>
        <source>Int J Eng Sci Technol</source>  
        <year>2013</year>  
        <volume>5</volume>  
        <issue>1</issue>  
        <fpage>A</fpage> </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Tsai</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Lai</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Chiang</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Yang</surname>
            <given-names>LT</given-names>
          </name>
        </person-group>
        <article-title>Data Mining for Internet of Things: A Survey</article-title>
        <source>IEEE Commun. Surv. Tutorials</source>  
        <year>2014</year>  
        <volume>16</volume>  
        <issue>1</issue>  
        <fpage>77</fpage>  
        <lpage>97</lpage>  
        <pub-id pub-id-type="doi">10.1109/Surv.2013.103013.00206</pub-id></nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Raghupathi</surname>
            <given-names>W</given-names>
          </name>
          <name name-style="western">
            <surname>Raghupathi</surname>
            <given-names>V</given-names>
          </name>
        </person-group>
        <article-title>Big data analytics in healthcare: promise and potential</article-title>
        <source>Health Inf Sci Syst</source>  
        <year>2014</year>  
        <volume>2</volume>  
        <fpage>3</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://hissjournal.biomedcentral.com/articles/10.1186/2047-2501-2-3"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1186/2047-2501-2-3</pub-id>
        <pub-id pub-id-type="medline">25825667</pub-id>
        <pub-id pub-id-type="pii">14</pub-id>
        <pub-id pub-id-type="pmcid">PMC4341817</pub-id></nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Groves</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <article-title>Beyond Excel: how to start cleaning data with OpenRefine</article-title>
        <source>Multimedia Information and Technology</source>  
        <year>2016</year>  
        <volume>42</volume>  
        <issue>2</issue>  
        <fpage>18</fpage>  
        <lpage>22</lpage> </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Ham</surname>
            <given-names>K</given-names>
          </name>
        </person-group>
        <article-title>OpenRefine (version 2.5). http://openrefine.org. Free, open-source tool for cleaning and transforming data</article-title>
        <source>J Med Libr Assoc</source>  
        <year>2013</year>  
        <month>7</month>  
        <volume>101</volume>  
        <issue>3</issue>  
        <fpage>233</fpage>  
        <lpage>234</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://openrefine.org"/>
        </comment>  
        <pub-id pub-id-type="doi">10.3163/1536-5050.101.3.020</pub-id></nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Gallant</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Lorang</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Ramirez</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <source>Tools for the digital humanities: a librarian's guide</source>  
        <year>2014</year>  
        <access-date>2018-11-14</access-date>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://mospace.umsystem.edu/xmlui/bitstream/handle/10355/44544/ToolsForTheDigitalHumanities.pdf?sequence=1">https://mospace.umsystem.edu/xmlui/bitstream/handle/10355/44544/ToolsForTheDigitalHumanities.pdf?sequence=1</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="73uqoKFfO"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Chu</surname>
            <given-names>X</given-names>
          </name>
          <name name-style="western">
            <surname>Ilyas</surname>
            <given-names>IF</given-names>
          </name>
        </person-group>
        <article-title>Qualitative data cleaning</article-title>
        <source>Proc. VLDB Endow</source>  
        <year>2016</year>  
        <month>09</month>  
        <day>01</day>  
        <volume>9</volume>  
        <issue>13</issue>  
        <fpage>1605</fpage>  
        <lpage>1608</lpage>  
        <pub-id pub-id-type="doi">10.14778/3007263.3007320</pub-id></nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Jones</surname>
            <given-names>R</given-names>
          </name>
        </person-group>
        <article-title>Big Data Analytics for Disparate Data</article-title>
        <source>American Journal of Intelligent Systems</source>  
        <year>2017</year>  
        <volume>7</volume>  
        <issue>2</issue>  
        <fpage>39</fpage>  
        <lpage>46</lpage>  
        <pub-id pub-id-type="doi">10.5923/j.ajis.20170702.01</pub-id></nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Yang</surname>
            <given-names>Q</given-names>
          </name>
        </person-group>
        <article-title>Data preparation for data mining</article-title>
        <source>Appl Artif Intell</source>  
        <year>2003</year>  
        <season>May-Jun</season>  
        <volume>17</volume>  
        <issue>5-6</issue>  
        <fpage>375</fpage>  
        <lpage>381</lpage>  
        <pub-id pub-id-type="doi">10.1080/08839510390219264</pub-id></nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Anagnostopoulos</surname>
            <given-names>I</given-names>
          </name>
          <name name-style="western">
            <surname>Zeadally</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Exposito</surname>
            <given-names>E</given-names>
          </name>
        </person-group>
        <article-title>Handling big data: research challenges and future directions</article-title>
        <source>J Supercomput</source>  
        <year>2016</year>  
        <month>2</month>  
        <day>25</day>  
        <volume>72</volume>  
        <issue>4</issue>  
        <fpage>1494</fpage>  
        <lpage>1516</lpage>  
        <pub-id pub-id-type="doi">10.1007/s11227-016-1677-z</pub-id>
        <pub-id pub-id-type="medline">26811110</pub-id></nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Rahm</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Do</surname>
            <given-names>H</given-names>
          </name>
        </person-group>
        <article-title>Data cleaning: Problems and current approaches</article-title>
        <source>IEEE Data Eng Bull</source>  
        <year>2000</year>  
        <volume>23</volume>  
        <issue>4</issue>  
        <fpage>3</fpage>  
        <lpage>13</lpage> </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Chakraborty</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Pagolu</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Garla</surname>
            <given-names>S</given-names>
          </name>
        </person-group>
        <source>Text mining and analysis: practical methods, examples, and case studies using SAS</source>  
        <year>2014</year>  
        <access-date>2018-11-14</access-date>
        <publisher-loc>Cary, NC</publisher-loc>
        <publisher-name>SAS Institute</publisher-name>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://support.sas.com/publishing/pubcat/chaps/65646.pdf">http://support.sas.com/publishing/pubcat/chaps/65646.pdf</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="73uraoski"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Zhu</surname>
            <given-names>W</given-names>
          </name>
          <name name-style="western">
            <surname>Foyle</surname>
            <given-names>B</given-names>
          </name>
          <name name-style="western">
            <surname>Gagné</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Gupta</surname>
            <given-names>V</given-names>
          </name>
          <name name-style="western">
            <surname>Magdalen</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Mundi</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <source>IBM Watson Content Analytics: Discovering Actionable Insight from Your Content</source>  
        <year>2014</year>  
        <publisher-loc>New York, USA</publisher-loc>
        <publisher-name>IBM Redbooks</publisher-name></nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Katsanevakis</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Gatto</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Zenetos</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Cardoso</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <source>Management of Biological Invasions</source>  
        <year>2013</year>  
        <access-date>2018-11-14</access-date>
        <comment>How many marine aliens in Europe? 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://pdfs.semanticscholar.org/4dbe/0bc865391bd3a6100e112e7046675341ba18.pdf">https://pdfs.semanticscholar.org/4dbe/0bc865391bd3a6100e112e7046675341ba18.pdf</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="73uswuwIP"/></comment> </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Dallas</surname>
            <given-names>DC</given-names>
          </name>
          <name name-style="western">
            <surname>Guerrero</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Khaldi</surname>
            <given-names>N</given-names>
          </name>
          <name name-style="western">
            <surname>Borghese</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Bhandari</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Underwood</surname>
            <given-names>MA</given-names>
          </name>
          <name name-style="western">
            <surname>Lebrilla</surname>
            <given-names>CB</given-names>
          </name>
          <name name-style="western">
            <surname>German</surname>
            <given-names>JB</given-names>
          </name>
          <name name-style="western">
            <surname>Barile</surname>
            <given-names>D</given-names>
          </name>
        </person-group>
        <article-title>A peptidomic analysis of human milk digestion in the infant stomach reveals protein-specific degradation patterns</article-title>
        <source>J Nutr</source>  
        <year>2014</year>  
        <month>06</month>  
        <volume>144</volume>  
        <issue>6</issue>  
        <fpage>815</fpage>  
        <lpage>820</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/24699806"/>
        </comment>  
        <pub-id pub-id-type="doi">10.3945/jn.113.185793</pub-id>
        <pub-id pub-id-type="medline">24699806</pub-id>
        <pub-id pub-id-type="pii">jn.113.185793</pub-id>
        <pub-id pub-id-type="pmcid">PMC4018946</pub-id></nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Hassanien</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Azar</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Snasel</surname>
            <given-names>V</given-names>
          </name>
          <name name-style="western">
            <surname>Kacprzyk</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Abawajy</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <source>Big Data in Complex Systems: Challenges and Opportunities Berlin, Germany</source>  
        <year>2015</year>  
        <publisher-loc>New York City, United States of America</publisher-loc>
        <publisher-name>Springer Publishing Company</publisher-name></nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Selvi</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Priyaa</surname>
            <given-names>D</given-names>
          </name>
        </person-group>
        <article-title>A Perspective Analysis on Removal of Duplicate Records using Data Mining Techniques: A Survey</article-title>
        <source>International Journal of Engineering Technology Science and Research</source>  
        <year>2016</year>  
        <volume>3</volume>  
        <issue>12</issue>  
        <fpage>36</fpage>  
        <lpage>41</lpage> </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Higazy</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>El Tobely</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Yousef</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Sarhan</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <article-title>Web-based Arabic/English duplicate record detection with nested blocking technique</article-title>
        <year>2013</year>  
        <conf-name>Computer Engineering &#38; Systems (ICCES), 8th International Conference on IEEE</conf-name>
        <conf-date>2013</conf-date>
        <conf-loc>Cairo, Egypt</conf-loc>
        <fpage>313</fpage>  
        <lpage>318</lpage> </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Elmagarmid</surname>
            <given-names>AK</given-names>
          </name>
          <name name-style="western">
            <surname>Ipeirotis</surname>
            <given-names>PG</given-names>
          </name>
          <name name-style="western">
            <surname>Verykios</surname>
            <given-names>VS</given-names>
          </name>
        </person-group>
        <article-title>Duplicate Record Detection: A Survey</article-title>
        <source>IEEE Trans. Knowl. Data Eng</source>  
        <year>2007</year>  
        <month>1</month>  
        <volume>19</volume>  
        <issue>1</issue>  
        <fpage>1</fpage>  
        <lpage>16</lpage>  
        <pub-id pub-id-type="doi">10.1109/Tkde.2007.250581</pub-id></nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Larsson</surname>
            <given-names>Per</given-names>
          </name>
        </person-group>
        <source>courses.cs.washington.edu</source>  
        <year>2013</year>  
        <access-date>2018-11-14</access-date>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://courses.cs.washington.edu/courses/cse544/13sp/final-projects/p12-plarsson.pdf">https://courses.cs.washington.edu/courses/cse544/13sp/final-projects/p12-plarsson.pdf</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="73utiScvt"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Medeiros</surname>
            <given-names>R</given-names>
          </name>
        </person-group>
        <article-title>Using regular expressions for data management in Stata</article-title>
        <source>West Coast Stata Users' Group Meetings, Stata Users Group</source>  
        <year>2007</year>  
        <fpage>-</fpage> </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="web">
        <source>Clustering In Depth</source>  
        <access-date>2018-11-14</access-date>
        <comment>2016 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/OpenRefine/OpenRefine/wiki/Clustering-In-Depth">https://github.com/OpenRefine/OpenRefine/wiki/Clustering-In-Depth</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="73utyFlHG"/></comment> </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Maletic</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Marcus</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <article-title>Data Cleansing: Beyond Integrity Analysis</article-title>
        <source>IQ</source>  
        <year>2000</year>  
        <fpage>200</fpage>  
        <lpage>209</lpage> </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Chu</surname>
            <given-names>X</given-names>
          </name>
          <name name-style="western">
            <surname>Ilyas</surname>
            <given-names>I</given-names>
          </name>
          <name name-style="western">
            <surname>Krishnan</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>Data cleaning: Overview and emerging challenges</article-title>
        <year>2016</year>  
        <conf-name>Proceedings of the International Conference on Management of Data. ACM</conf-name>
        <conf-date>2016</conf-date>
        <conf-loc>San Francisco, CA, USA</conf-loc>
        <fpage>2201</fpage>  
        <lpage>2206</lpage> </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
