% Conference paper in the DS 2001 proceedings. Acronyms are brace-protected against style recasing, names use the unambiguous "Last, First" form, and the abstract uses LaTeX quote syntax so the entry stays safe for classic 8-bit BibTeX.
@inproceedings{1b0551bc2233435b953a525515a151fb,
title = "{SCOOP}: A record extractor without knowledge on input",
abstract = "We present a record extractor system SCOOP. We assume that semi-structured documents given to SCOOP contain similar formats and each of them has only a record consisting of some different fields. SCOOP treats a document as just a string and does not use knowledge on input except that a field is surrounded with delimiters, a left delimiter ends with ``>'', and the corresponding right delimiter begins with ``<''. By counting substrings, SCOOP roughly divides into two parts: contents of the fields and others. SCOOP counts substrings near boundaries of two parts and extracts the most frequent substrings as delimiters. We show experimental results with news articles written in English or Japanese. A record consists of the headline and the body text on this experiment. SCOOP extracts records at a high rate.",
author = "Yamada, Yasuhiro and Ikeda, Daisuke and Hirokawa, Sachio",
year = "2001",
doi = "10.1007/3-540-45650-3_45",
language = "English",
isbn = "9783540429562",
series = "Lecture Notes in Computer Science",
volume = "2226",
publisher = "Springer-Verlag",
pages = "482--487",
editor = "Jantke, Klaus P. and Shinohara, Ayumi",
booktitle = "Discovery Science: 4th International Conference, {DS} 2001, Proceedings",
address = "Berlin, Heidelberg",
note = "4th International Conference on Discovery Science, DS 2001; conference dates: 25--28 November 2001",
}