package ivory.core.tokenize;
import static org.junit.Assert.assertTrue;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import junit.framework.JUnit4TestAdapter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.junit.Assert;
import org.junit.Test;
import edu.umd.hooka.VocabularyWritable;
public class TokenizationTest {
// Base directory that the "data/tokenizer/..." paths used by the tests are resolved against.
private String dir = "./";
// Language codes iterated by testAllTokenization(); "zh" is currently commented out of this list.
private String[] languages = {"ar", "tr", "cs", "es", "de", "fr", "en"};//, "zh"};
/**
 * Reads a UTF-8 encoded text file into a list of its lines.
 * <p>
 * An {@link IOException} while opening or reading the file fails the current test through
 * {@link Assert#fail(String)}. The underlying stream is closed in a {@code finally} block so
 * that the file handle is released even when reading fails part-way through.
 *
 * @param file path of the file to read
 * @return the lines of the file in order, without line terminators
 */
private List<String> readInput(String file) {
  List<String> lines = new ArrayList<String>();
  FileInputStream fis = null;
  BufferedReader in = null;
  try {
    fis = new FileInputStream(file);
    in = new BufferedReader(new InputStreamReader(fis, "UTF8"));
    String line;
    while ((line = in.readLine()) != null) {
      lines.add(line);
    }
    return lines;
  } catch (IOException e) {
    e.printStackTrace();
    Assert.fail("Unable to read " + file + ": " + e.getMessage());
  } finally {
    try {
      // Closing the outermost wrapper that was successfully created also closes the file stream.
      if (in != null) {
        in.close();
      } else if (fis != null) {
        fis.close();
      }
    } catch (IOException ignored) {
      // Nothing useful can be done here: either the lines were already read completely,
      // or the original read failure has already been reported above.
    }
  }
  // Only here to satisfy the compiler: Assert.fail() always throws, so this is not reached.
  return null;
}
/**
 * Creates a tokenizer for the given settings, tokenizes every line of {@code inputFile} and
 * compares the result, token by token, with the whitespace-separated tokens found on the
 * corresponding line of {@code expectedFile}.
 * <p>
 * Besides comparing each token, this verifies that the expected file has a line for every
 * input line and that the number of produced tokens matches the number of expected tokens,
 * so that missing or surplus tokens are reported as assertion failures.
 *
 * @param lang               language code passed to the tokenizer factory
 * @param tokenizerModelFile path of the tokenizer model file, or {@code null}
 * @param isStem             whether stemming is requested
 * @param stopwordsFile      path of the stopword list, or {@code null} for no stopword removal
 * @param vocab              vocabulary passed to the tokenizer factory, may be {@code null}
 * @param inputFile          file containing one raw sentence per line
 * @param expectedFile       file containing the expected tokens of each sentence, one sentence per line
 * @throws IOException if the tokenizer factory fails to create the tokenizer
 */
public void testTokenization(String lang, String tokenizerModelFile, boolean isStem, String stopwordsFile, VocabularyWritable vocab, String inputFile, String expectedFile) throws IOException{
  Tokenizer tokenizer = TokenizerFactory.createTokenizer(lang, tokenizerModelFile, isStem, stopwordsFile, null, vocab);

  // two classes are not aware of the stemming/stopword options (they use default option instead): StanfordChineseTokenizer, GalagoTokenizer
  assertTrue(tokenizer.isStemming() == isStem
      || (tokenizer.getClass() == StanfordChineseTokenizer.class)
      || (tokenizer.getClass() == GalagoTokenizer.class));
  assertTrue(tokenizer.isStopwordRemoval() == (stopwordsFile != null)
      || (tokenizer.getClass() == StanfordChineseTokenizer.class)
      || (tokenizer.getClass() == GalagoTokenizer.class));

  List<String> sentences = readInput(inputFile);
  List<String> expectedSentences = readInput(expectedFile);
  // Without this check a short expected file surfaces as an IndexOutOfBoundsException below.
  assertTrue("expected file " + expectedFile + " has " + expectedSentences.size()
      + " lines but input file " + inputFile + " has " + sentences.size(),
      expectedSentences.size() >= sentences.size());

  for (int i = 0; i < sentences.size(); i++) {
    String sentence = sentences.get(i);
    // A blank expected line means "no tokens"; split() alone would yield one empty token for it.
    String expectedLine = expectedSentences.get(i).trim();
    String[] expectedTokens = expectedLine.length() == 0 ? new String[0] : expectedLine.split("\\s+");
    System.out.println("Testing sentence:"+sentence);

    String[] tokens = tokenizer.processContent(sentence);
    for (int tokenCnt = 0; tokenCnt < tokens.length; tokenCnt++) {
      String token = tokens[tokenCnt];
      System.out.println("Token "+tokenCnt+":"+token);
      // Report surplus tokens as a test failure instead of an ArrayIndexOutOfBoundsException.
      assertTrue("token "+tokenCnt+":"+token+" has no expected counterpart for sentence "+i+" in "+expectedFile,
          tokenCnt < expectedTokens.length);
      Assert.assertEquals("token "+tokenCnt+":"+token+",expected="+expectedTokens[tokenCnt],
          expectedTokens[tokenCnt], token);
    }
    // Catches the case where the tokenizer produced fewer tokens than expected.
    Assert.assertEquals("number of tokens for sentence "+i+":"+sentence,
        expectedTokens.length, tokens.length);
  }
}
/**
 * Measures how long the configured tokenizer needs to process the same sentence 1000 times.
 * Tokenizer creation is not included in the measured interval.
 *
 * @param lang               language code passed to the tokenizer factory
 * @param tokenizerModelFile path of the tokenizer model file
 * @param isStem             whether stemming is requested
 * @param stopwordsFile      path of the stopword list, or {@code null}
 * @param vocab              vocabulary passed to the tokenizer factory, may be {@code null}
 * @param sentence           text that is tokenized repeatedly
 * @return elapsed wall-clock time in milliseconds for all 1000 runs
 * @throws IOException if the tokenizer factory fails to create the tokenizer
 */
public long testTokenizationTime(String lang, String tokenizerModelFile, boolean isStem, String stopwordsFile, VocabularyWritable vocab, String sentence) throws IOException{
  Tokenizer tokenizer = TokenizerFactory.createTokenizer(lang, tokenizerModelFile, isStem, stopwordsFile, null, vocab);

  final long start = System.currentTimeMillis();
  for (int run = 0; run < 1000; run++) {
    tokenizer.processContent(sentence);
  }
  final long elapsed = System.currentTimeMillis() - start;
  return elapsed;
}
/**
 * Runs the tokenization comparison for every language in {@code languages}, covering all four
 * combinations of stemming and stopword removal. A subset of languages is additionally run
 * without a model file, and English is additionally checked against the "-galago" expected file.
 */
@Test
public void testAllTokenization() {
  try {
    for (String language : languages) {
      String testDir = dir + "data/tokenizer/test/";
      String modelDir = dir + "data/tokenizer/";

      String rawFile = testDir + language + "-test.raw";
      String tokenizedFile = testDir + language + "-test.tok";
      String tokenizedStemmedFile = testDir + language + "-test.tok.stemmed";
      String tokenizedStopFile = testDir + language + "-test.tok.stop";
      String tokenizedStemmedStopFile = testDir + language + "-test.tok.stemmed.stop";
      String modelFile = modelDir + language + "-token.bin";
      String stopwordsFile = modelDir + language + ".stop";

      // with the model file: plain, stemmed, stopped, stemmed + stopped
      testTokenization(language, modelFile, false, null, null, rawFile, tokenizedFile);
      testTokenization(language, modelFile, true, null, null, rawFile, tokenizedStemmedFile);
      testTokenization(language, modelFile, false, stopwordsFile, null, rawFile, tokenizedStopFile);
      testTokenization(language, modelFile, true, stopwordsFile, null, rawFile, tokenizedStemmedStopFile);

      // for Lucene tokenizers, everything should work without model file
      boolean worksWithoutModel = "cs".equals(language)
          || "ar".equals(language)
          || "tr".equals(language)
          || "es".equals(language);
      if (worksWithoutModel) {
        testTokenization(language, null, false, null, null, rawFile, tokenizedFile);
        testTokenization(language, null, true, null, null, rawFile, tokenizedStemmedFile);
        testTokenization(language, null, false, stopwordsFile, null, rawFile, tokenizedStopFile);
        testTokenization(language, null, true, stopwordsFile, null, rawFile, tokenizedStemmedStopFile);
      }

      if ("en".equals(language)) {
        // stemming = true or false should have same output (since stemming is default)
        String galagoFile = tokenizedFile + "-galago";
        testTokenization(language, null, false, null, null, rawFile, galagoFile);
        testTokenization(language, null, true, null, null, rawFile, galagoFile);
      }
    }
  } catch (IOException e) {
    Assert.fail("Error in tokenizer test: " + e.getMessage());
  }
}
/**
 * Prints the average tokenization time per sentence for a fixed list of languages, using
 * stemming and stopword removal. Nothing is asserted about the timings themselves; the test
 * only fails when a tokenizer cannot be created.
 */
@Test
public void testTokenizationTime() {
  final String sentence = "Although they are at temperatures of roughly 3000–4500 K (2727–4227 °C),";
  // local list (it differs from the languages field: no "fr", but includes "zh")
  final String[] timedLanguages = {"ar", "tr", "cs", "es", "de", "en", "zh"};
  try {
    for (int k = 0; k < timedLanguages.length; k++) {
      String language = timedLanguages[k];
      String modelFile = dir + "data/tokenizer/" + language + "-token.bin";
      String stopwordsFile = dir + "data/tokenizer/" + language + ".stop";
      long elapsedMillis = testTokenizationTime(language, modelFile, true, stopwordsFile, null, sentence);
      // the helper tokenizes the sentence 1000 times, hence the division
      float millisPerSentence = elapsedMillis / 1000f;
      System.out.println("Tokenization for " + language + " : " + millisPerSentence + "ms/sentence");
    }
  } catch (IOException e) {
    Assert.fail("Error in tokenizer test: " + e.getMessage());
  }
}
/**
 * Checks the out-of-vocabulary rate that a tokenizer reports for every sentence of
 * {@code data/tokenizer/test/<language>-test.raw} against the expected values.
 * <p>
 * Rates are compared with a small tolerance rather than exact float equality, and a missing
 * expected rate is reported as a test failure instead of an index exception.
 *
 * @param language          language code; also used to derive the model, stopword and test file names
 * @param vocab             vocabulary against which OOV rates are computed
 * @param isStemming        whether the tokenizer should stem
 * @param isStopwordRemoval whether the tokenizer is given the stopword lists
 * @param expectedOOVRates  expected OOV rate per sentence, in file order
 */
public void testOOV(String language, VocabularyWritable vocab, boolean isStemming, boolean isStopwordRemoval, float[] expectedOOVRates) {
  final float tolerance = 1e-6f;
  String modelFile = dir + "data/tokenizer/" + language + "-token.bin";
  // Stopword lists are only handed to the factory when stopword removal is requested.
  String stopwordsFile = isStopwordRemoval ? dir + "data/tokenizer/" + language + ".stop" : null;
  String stemmedStopwordsFile = isStopwordRemoval ? dir + "data/tokenizer/" + language + ".stop.stemmed" : null;

  Tokenizer tokenizer;
  Configuration conf = new Configuration();
  try {
    tokenizer = TokenizerFactory.createTokenizer(FileSystem.getLocal(conf), conf, language, modelFile, isStemming, stopwordsFile, stemmedStopwordsFile, null);
  } catch (IOException e) {
    Assert.fail("Unable to create tokenizer: " + e.getMessage());
    // Not reached at runtime (fail() throws); needed so the compiler sees tokenizer as assigned.
    return;
  }

  List<String> sentences = readInput(dir + "data/tokenizer/test/" + language + "-test.raw");
  assertTrue("Only " + expectedOOVRates.length + " expected OOV rates given for " + sentences.size() + " sentences",
      sentences.size() <= expectedOOVRates.length);
  for (int i = 0; i < sentences.size(); i++) {
    String sentence = sentences.get(i);
    float oovRate = tokenizer.getOOVRate(sentence, vocab);
    Assert.assertEquals("Sentence " + i + ":" + oovRate + "!=" + expectedOOVRates[i],
        expectedOOVRates[i], oovRate, tolerance);
  }
}
/**
 * OOV-rate test for Chinese. The vocabulary is built from the tokens on the fourth line of the
 * expected "zh-test.tok.stemmed.stop" file plus two numeric tokens, and the same expected rates
 * are used for every combination of stemming and stopword removal.
 */
@Test
public void testChineseOOVs() {
  VocabularyWritable vocab = new VocabularyWritable();
  List<String> tokenizedLines = readInput(dir + "data/tokenizer/test/zh-test.tok.stemmed.stop");
  String[] knownTokens = tokenizedLines.get(3).split(" ");
  for (int k = 0; k < knownTokens.length; k++) {
    vocab.addOrGet(knownTokens[k]);
  }
  vocab.addOrGet("1457");
  vocab.addOrGet("19");

  // all same since no stemming or stopword removal
  float[] zhExpectedOOVRates = {0.6666667f, 0.8666667f, 0.72727275f, 0f};

  // Same call order as listing the four combinations by hand:
  // (stem, stop) = (true, true), (false, true), (true, false), (false, false)
  boolean[] onThenOff = {true, false};
  for (boolean stopwordRemoval : onThenOff) {
    for (boolean stemming : onThenOff) {
      testOOV("zh", vocab, stemming, stopwordRemoval, zhExpectedOOVRates);
    }
  }
}
/**
 * OOV-rate test for Turkish. The vocabulary is built from the tokens on the fourth line of the
 * expected "tr-test.tok.stemmed.stop" file plus three extra entries; each combination of
 * stemming and stopword removal has its own expected rates.
 */
@Test
public void testTurkishOOVs() {
  VocabularyWritable vocab = new VocabularyWritable();
  List<String> tokenizedLines = readInput(dir + "data/tokenizer/test/tr-test.tok.stemmed.stop");
  String[] knownTokens = tokenizedLines.get(3).split(" ");
  for (int k = 0; k < knownTokens.length; k++) {
    vocab.addOrGet(knownTokens[k]);
  }
  String[] extraEntries = {"ispanyol", "isim", "10"};
  for (String entry : extraEntries) {
    vocab.addOrGet(entry);
  }

  // stemming + stopword removal
  float[] stemmedAndStopped = {0.85714287f, 1f, 0.6f, 0f};
  testOOV("tr", vocab, true, true, stemmedAndStopped);

  // stopword removal only
  float[] stoppedOnly = {1f, 1f, 0.8f, 0.5f};
  testOOV("tr", vocab, false, true, stoppedOnly);

  // stemming only
  float[] stemmedOnly = {0.85714287f, 1f, 0.71428573f, 0.33333334f};
  testOOV("tr", vocab, true, false, stemmedOnly);

  // neither
  float[] plain = {1f, 1f, 0.85714287f, 0.6666667f};
  testOOV("tr", vocab, false, false, plain);
}
/**
 * OOV-rate test for Arabic. The vocabulary is built from the tokens on the first line of the
 * expected "ar-test.tok.stemmed.stop" file plus the token "2011"; each combination of stemming
 * and stopword removal has its own expected rates.
 */
@Test
public void testArabicOOVs() {
  VocabularyWritable vocab = new VocabularyWritable();
  List<String> tokenizedLines = readInput(dir + "data/tokenizer/test/ar-test.tok.stemmed.stop");
  String[] knownTokens = tokenizedLines.get(0).split(" ");
  for (int k = 0; k < knownTokens.length; k++) {
    vocab.addOrGet(knownTokens[k]);
  }
  vocab.addOrGet("2011");

  // stemming + stopword removal
  float[] stemmedAndStopped = {0f, 1f, 0.8181818f, 1f};
  testOOV("ar", vocab, true, true, stemmedAndStopped);

  // stopword removal only
  float[] stoppedOnly = {0.6666667f, 1f, 0.8181818f, 1f};
  testOOV("ar", vocab, false, true, stoppedOnly);

  // stemming only
  float[] stemmedOnly = {0f, 1f, 0.85714287f, 1f};
  testOOV("ar", vocab, true, false, stemmedOnly);

  // neither
  float[] plain = {0.6666667f, 1f, 0.85714287f, 1f};
  testOOV("ar", vocab, false, false, plain);
}
/**
 * OOV-rate test for English. The vocabulary is a fixed, hand-written list of entries; each
 * combination of stemming and stopword removal has its own expected rates.
 */
@Test
public void testEnglishOOVs() {
  // NOTE(review): several entries (e.g. "dictionari", "mytholog") look like stemmed forms — confirm.
  String[] vocabEntries = {
      "r.d.", "craig", "dictionari", "polynesian", "mytholog",
      "greenwood", "press", "new", "york", "1989",
      "24", "26", "english", "tree", "einbaum"
  };
  VocabularyWritable vocab = new VocabularyWritable();
  for (String entry : vocabEntries) {
    vocab.addOrGet(entry);
  }

  // stemming + stopword removal
  float[] stemmedAndStopped = {1f, 18/19f, 4/7.0f, 0f};
  testOOV("en", vocab, true, true, stemmedAndStopped);

  // stopword removal only
  float[] stoppedOnly = {1f, 18/19f, 4/7.0f, 2/12f};
  testOOV("en", vocab, false, true, stoppedOnly);

  // stemming only
  float[] stemmedOnly = {1f, 36/37f, 15/18.0f, 7/19f};
  testOOV("en", vocab, true, false, stemmedOnly);

  // neither
  float[] plain = {1f, 36/37f, 15/18.0f, 9/19f};
  testOOV("en", vocab, false, false, plain);
}
/**
 * Wraps this JUnit 4 test class in a {@link JUnit4TestAdapter} so that it can be returned as a
 * {@code junit.framework.Test}.
 *
 * @return an adapter around {@code TokenizationTest}
 */
public static junit.framework.Test suite() {
  JUnit4TestAdapter adapter = new JUnit4TestAdapter(TokenizationTest.class);
  return adapter;
}
}