package com.googlecode.gaal.data.impl;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;

import org.junit.Before;
import org.junit.Test;

import com.googlecode.gaal.analysis.impl.Analyser;
import com.googlecode.gaal.data.api.Corpus;
import com.googlecode.gaal.data.api.IntSequence;
import com.googlecode.gaal.preprocess.api.Tokenizer;
import com.googlecode.gaal.preprocess.api.Tokenizer.Document;
import com.googlecode.gaal.preprocess.impl.LowerCaseNormalizer;
import com.googlecode.gaal.preprocess.impl.RegexTokenizer;
/**
 * Tests for {@link Corpus} construction and token/int mapping via
 * {@link TreeMapCorpus}. Also provides static factory methods that build
 * reference corpora (character-level and word-level) used by other tests.
 */
public class CorpusTest {

    /** Word separators used when building word-level corpora from text files. */
    private static final Set<String> SEPARATORS;
    static {
        SEPARATORS = new HashSet<String>();
        SEPARATORS.add(".");
        SEPARATORS.add(",");
        SEPARATORS.add("(");
        SEPARATORS.add(")");
    }

    // private static final int ALPHABET_SIZE = 2780;

    /** Matches any single character, so the tokenizer emits one token per char. */
    private static final String CHAR_REGEX = "[\\W\\w]";

    /** Character-level test corpus; '#' acts as the terminal symbol. */
    private static final String TINY_CORPUS = "caggtcagtcacggtatca#";

    /** Expected alphabet; slot 0 is reserved (null), so symbol at index i encodes as i. */
    private static final String[] TINY_CORPUS_ALPHABET = { null, "a", "c", "g", "t", "#" };

    /** TINY_CORPUS encoded through TINY_CORPUS_ALPHABET, one code per character. */
    private static final IntSequence SEQUENCE = new ArraySequence(new int[] { 2, 1, 3, 3, 4, 2, 1, 3, 4, 2, 1, 2, 3, 3,
            4, 1, 4, 2, 1, 5 });

    private Corpus<String> corpus;

    @Before
    public void setUp() throws Exception {
        corpus = createTinyCorpus();
    }

    /** The character tokenizer must emit TINY_CORPUS one character at a time, in order. */
    @Test
    public void testTokenizer() {
        StringReader sr = new StringReader(TINY_CORPUS);
        Tokenizer<String> tokenizer = new RegexTokenizer(sr, CHAR_REGEX, new LowerCaseNormalizer());
        for (Document<String> doc : tokenizer) {
            int index = 0;
            for (String token : doc) {
                // assertEquals reports both values on failure, replacing the old
                // System.out diagnostic + assertTrue pair.
                assertEquals(TINY_CORPUS.substring(index, index + 1), token);
                index++;
            }
            // Guard against a vacuous pass: every character must have been emitted.
            assertEquals(TINY_CORPUS.length(), index);
        }
    }

    /** The corpus sequence must match the hand-encoded reference SEQUENCE element-wise. */
    @Test
    public void testSequence() {
        IntSequence sequence = corpus.sequence();
        assertEquals(SEQUENCE.size(), sequence.size());
        for (int i = 0; i < SEQUENCE.size(); i++) {
            assertEquals("mismatch at position " + i, SEQUENCE.get(i), sequence.get(i));
        }
    }

    /** Every non-null reference symbol must appear in the corpus alphabet. */
    @Test
    public void testAlphabet() {
        Set<String> alphabet = corpus.alphabet();
        for (String symbol : TINY_CORPUS_ALPHABET) {
            if (symbol != null) {
                assertTrue("alphabet missing symbol: " + symbol, alphabet.contains(symbol));
            }
        }
    }

    /**
     * Alphabet size excludes the reserved null slot, and must be identical
     * whether the corpus is built with no separator set or an empty one.
     */
    @Test
    public void testAlphabetSize() throws FileNotFoundException {
        assertEquals(TINY_CORPUS_ALPHABET.length - 1, corpus.alphabetSize());
        // NOTE(review): FileReader decodes with the platform default charset; if
        // data/tlg.txt is not in that charset the alphabet can vary by JVM — confirm.
        FileReader reader = new FileReader("data/tlg.txt");
        Tokenizer<String> tokenizer = new RegexTokenizer(reader, Analyser.STRING_REGEX, new LowerCaseNormalizer());
        Corpus<String> corpus1 = new TreeMapCorpus(tokenizer);
        reader = new FileReader("data/tlg.txt");
        tokenizer = new RegexTokenizer(reader, Analyser.STRING_REGEX, new LowerCaseNormalizer());
        Corpus<String> corpus2 = new TreeMapCorpus(tokenizer, new HashSet<String>());
        // assertTrue(corpus1.alphabetSize() == ALPHABET_SIZE);
        assertEquals(corpus1.alphabetSize(), corpus2.alphabetSize());
    }

    /**
     * toInt must map each character of TINY_CORPUS to its code in SEQUENCE.
     * Iterates with substring rather than String.split(""), whose leading
     * empty element (which the old strings[i + 1] indexing relied on) was
     * dropped in Java 8, breaking the previous version of this test.
     */
    @Test
    public void testToInt() {
        assertEquals(SEQUENCE.size(), TINY_CORPUS.length());
        for (int i = 0; i < SEQUENCE.size(); i++) {
            assertEquals(SEQUENCE.get(i), corpus.toInt(TINY_CORPUS.substring(i, i + 1)));
        }
    }

    /**
     * toToken must be the inverse mapping: each code in SEQUENCE decodes to the
     * corresponding character of TINY_CORPUS. Same substring iteration as
     * testToInt, avoiding the version-dependent String.split("") behavior.
     */
    @Test
    public void testToToken() {
        assertEquals(SEQUENCE.size(), TINY_CORPUS.length());
        for (int i = 0; i < SEQUENCE.size(); i++) {
            assertEquals(TINY_CORPUS.substring(i, i + 1), corpus.toToken(SEQUENCE.get(i)));
        }
    }

    /** Character-level "mississippi#" corpus with a fixed alphabet. */
    public static Corpus<String> createMississippiCorpus() {
        String text = "mississippi#";
        String[] alphabet = { null, "i", "m", "p", "s", "#" };
        StringReader sr = new StringReader(text);
        Tokenizer<String> tokenizer = new RegexTokenizer(sr, CHAR_REGEX, new LowerCaseNormalizer());
        return new TreeMapCorpus(tokenizer, alphabet);
    }

    /** Character-level "mining␣engineering#" corpus with a fixed alphabet. */
    public static Corpus<String> createMiningEngineeringCorpus() {
        String text = "mining␣engineering#";
        String[] alphabet = { null, "e", "g", "i", "m", "n", "r", "␣", "#" };
        StringReader sr = new StringReader(text);
        Tokenizer<String> tokenizer = new RegexTokenizer(sr, CHAR_REGEX, new LowerCaseNormalizer());
        return new TreeMapCorpus(tokenizer, alphabet);
    }

    /** Character-level corpus over TINY_CORPUS using TINY_CORPUS_ALPHABET. */
    public static Corpus<String> createTinyCorpus() {
        StringReader sr = new StringReader(TINY_CORPUS);
        Tokenizer<String> tokenizer = new RegexTokenizer(sr, CHAR_REGEX, new LowerCaseNormalizer());
        return new TreeMapCorpus(tokenizer, TINY_CORPUS_ALPHABET);
    }

    /** Word-level corpus over data/tlg.txt; alphabet discovered from the text. */
    public static Corpus<String> createSmallCorpus() throws FileNotFoundException {
        FileReader reader = new FileReader("data/tlg.txt");
        Tokenizer<String> tokenizer = new RegexTokenizer(reader, Analyser.STRING_REGEX, new LowerCaseNormalizer());
        return new TreeMapCorpus(tokenizer, SEPARATORS);
    }

    /** Word-level corpus over data/moby.txt; alphabet discovered from the text. */
    public static Corpus<String> createLargeCorpus() throws FileNotFoundException {
        FileReader reader = new FileReader("data/moby.txt");
        Tokenizer<String> tokenizer = new RegexTokenizer(reader, Analyser.STRING_REGEX, new LowerCaseNormalizer());
        return new TreeMapCorpus(tokenizer, SEPARATORS);
    }
}