package com.googlecode.gaal.analysis.impl;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;
import com.googlecode.gaal.analysis.api.Context;
import com.googlecode.gaal.analysis.api.Filter;
import com.googlecode.gaal.analysis.api.IntervalSetBuilder;
import com.googlecode.gaal.analysis.impl.Aligner.AlignmentQueue;
import com.googlecode.gaal.analysis.impl.EmbeddedIntervalExtractor.EmbeddedContext;
import com.googlecode.gaal.data.api.Corpus;
import com.googlecode.gaal.data.impl.ArraySequence;
import com.googlecode.gaal.data.impl.CorpusTest;
import com.googlecode.gaal.data.impl.TreeMapCorpus;
import com.googlecode.gaal.preprocess.api.Tokenizer;
import com.googlecode.gaal.preprocess.impl.LowerCaseNormalizer;
import com.googlecode.gaal.preprocess.impl.MultidocumentRegexTokenizer;
import com.googlecode.gaal.preprocess.impl.StopWordRemover;
import com.googlecode.gaal.suffix.api.EmbeddedSuffixTree.EmbeddedInterval;
import com.googlecode.gaal.suffix.api.IntervalTree.Interval;
import com.googlecode.gaal.suffix.api.LinearizedSuffixTree;
import com.googlecode.gaal.suffix.api.LinearizedSuffixTree.BinaryInterval;
import com.googlecode.gaal.suffix.impl.LinearizedSuffixTreeImpl;
/**
 * Drives corpus-analysis experiments: builds linearized suffix trees for a
 * source corpus (and optionally a target corpus) and prints contexts,
 * recursively extracted intervals, and cross-corpus alignments to standard
 * output.
 *
 * <p>NOTE(review): {@link #delimiter} is mutable static state shared by every
 * instance; using several Analysers concurrently with different delimiters is
 * not safe — confirm whether that ever happens in callers.
 */
public class Analyser {

    /** Token pattern: runs of letters (incl. German umlauts/ß) or digits, or a single punctuation mark. */
    public static final String STRING_REGEX = "([A-Za-zÜüÄäÖöẞß0-9]+)|[/'\"\\.,;:!\\?\\(\\)]";

    /** Default separator tokens used when building a corpus. */
    public static final Set<String> SEPARATORS = new HashSet<String>(Arrays.asList(new String[] { ".", ",", ";", "(",
            ")", ":", "!", "?" }));

    /** Default stop-word set (intentionally empty). */
    public static final Set<String> STOP_WORDS = Collections.emptySet();

    /** Delimiter placed between tokens when rendering sequences as strings; see {@link #setDelimiter(String)}. */
    private static String delimiter = " ";

    protected final Corpus<String> srcCorpus;
    /** Target corpus; may be {@code null} for single-corpus analyses (see {@link #simpleTest()}). */
    protected final Corpus<String> dstCorpus;
    protected LinearizedSuffixTree srcLST;
    protected LinearizedSuffixTree dstLST;
    protected final ArraySequence srcSequence;
    /** {@code null} when {@link #dstCorpus} is {@code null}. */
    protected final ArraySequence dstSequence;

    /**
     * Creates an analyser and eagerly builds the suffix trees for both corpora.
     *
     * @param srcCorpus the source corpus, must not be {@code null}
     * @param dstCorpus the target corpus, or {@code null} for source-only analysis
     */
    public Analyser(final Corpus<String> srcCorpus, final Corpus<String> dstCorpus) {
        this(srcCorpus, dstCorpus, true);
    }

    /**
     * Creates an analyser, optionally skipping suffix-tree construction
     * (subclasses may build them differently).
     *
     * @param srcCorpus the source corpus, must not be {@code null}
     * @param dstCorpus the target corpus, or {@code null} for source-only analysis
     * @param buildLST  whether to construct the linearized suffix trees now;
     *                  when {@code false}, {@code srcLST}/{@code dstLST} stay unset
     */
    protected Analyser(final Corpus<String> srcCorpus, final Corpus<String> dstCorpus, boolean buildLST) {
        this.srcCorpus = srcCorpus;
        this.dstCorpus = dstCorpus;
        srcSequence = srcCorpus.sequence();
        // Timing is only collected (and printed) when the Aligner is in verbose mode.
        Stopwatch stopwatch = null;
        if (Aligner.isVerbose()) {
            stopwatch = new Stopwatch();
            stopwatch.start();
        }
        if (buildLST) {
            srcLST = new LinearizedSuffixTreeImpl(srcSequence, srcCorpus.alphabetSize());
            if (stopwatch != null) {
                System.out.printf("constructed source suffix arrays in %s\n", stopwatch);
            }
        }
        if (dstCorpus != null) {
            dstSequence = dstCorpus.sequence();
            if (buildLST) {
                if (stopwatch != null) {
                    stopwatch.reset();
                }
                dstLST = new LinearizedSuffixTreeImpl(dstSequence, dstCorpus.alphabetSize());
                if (stopwatch != null) {
                    System.out.printf("constructed target suffix arrays in %s\n", stopwatch);
                }
            }
        } else {
            dstSequence = null;
            dstLST = null;
        }
    }

    /** Entry point for ad-hoc experiments; currently runs the recursive-alignment demo. */
    public static void main(String[] args) throws IOException {
        // simpleTest();
        testRecursiveAlignments();
    }

    /**
     * Demo: aligns recursively extracted intervals between an English and a
     * German corpus read from {@code data/de-en.en} / {@code data/de-en.de}.
     *
     * @throws FileNotFoundException if either corpus file is missing
     */
    public static void testRecursiveAlignments() throws FileNotFoundException {
        String srcFileName = "data/de-en.en";
        String dstFileName = "data/de-en.de";
        Set<String> srcStopWords = new HashSet<String>(Arrays.asList(new String[] { "a", "the" }));
        Set<String> dstStopWords = new HashSet<String>(Arrays.asList(new String[] { "ein", "eine", "einem", "einen",
                "einer", "eines", "das", "dem", "den", "der", "des", "die" }));
        // NOTE(review): the empty string "" in the source separators looks accidental — confirm.
        Set<String> srcSeparators = new HashSet<String>(
                Arrays.asList(new String[] { ".", ",", ";", "", "(", ")", "of" }));
        Set<String> dstSeparators = new HashSet<String>(Arrays.asList(new String[] { ".", ",", ";", "(", ")", "von" }));
        // NOTE(review): FileReader uses the platform default charset; the token regex
        // admits umlauts, so the data files presumably need an explicit charset
        // (InputStreamReader + UTF-8) — confirm against the files before changing.
        FileReader srcReader = new FileReader(srcFileName);
        FileReader dstReader = new FileReader(dstFileName);
        try {
            Tokenizer<String> srcTokenizer = new MultidocumentRegexTokenizer(srcReader, STRING_REGEX,
                    new StopWordRemover(srcStopWords, new LowerCaseNormalizer()));
            Tokenizer<String> dstTokenizer = new MultidocumentRegexTokenizer(dstReader, STRING_REGEX,
                    new StopWordRemover(dstStopWords, new LowerCaseNormalizer()));
            Corpus<String> srcCorpus = new TreeMapCorpus(srcTokenizer, srcSeparators);
            Corpus<String> dstCorpus = new TreeMapCorpus(dstTokenizer, dstSeparators);
            double minSimilarity = 0.3;
            int alignmentsNumber = 3;
            int minVectorSize = 5;
            int windowSize = 9;
            double minLeftRightContextRatio = 0;
            int minNonEmptyFillerCount = 0;
            int srcDepth = 1;
            int srcMaxDepth = -1;
            int dstDepth = 0;
            int dstMaxDepth = -1;
            Analyser analyser = new Analyser(srcCorpus, dstCorpus);
            IntervalSetBuilder intervalSetBuilder = new NonSingletonBwtSetBuilder();
            // IntervalSetBuilder intervalSetBuilder = new
            // ProperIntervalSetBuilder();
            analyser.printRecursiveAlignments(intervalSetBuilder, minLeftRightContextRatio, minNonEmptyFillerCount,
                    minSimilarity, alignmentsNumber, minVectorSize, windowSize, srcDepth, srcMaxDepth, dstDepth,
                    dstMaxDepth);
        } finally {
            // Fix: these readers were previously leaked; release them once the
            // corpora and alignments have been fully computed.
            closeQuietly(srcReader);
            closeQuietly(dstReader);
        }
    }

    /** Closes {@code reader} if non-null, swallowing any IOException (best-effort cleanup in a demo driver). */
    private static void closeQuietly(Reader reader) {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException ignored) {
                // nothing useful to do on close failure in this test path
            }
        }
    }

    /** Demo: prints source contexts for a small built-in test corpus (no target corpus). */
    public static void simpleTest() {
        int windowSize = 9;
        Analyser analyser = new Analyser(CorpusTest.createMiningEngineeringCorpus(), null);
        IntervalSetBuilder intervalSetBuilder = new NonSingletonBwtSetBuilder();
        analyser.printSrcContexts(intervalSetBuilder, " ", windowSize);
    }

    /** Prints the source corpus's sequence length and alphabet size. */
    public void printSrcCorpusInfo() {
        System.out.println("Text size: " + srcCorpus.sequence().size());
        System.out.println("Alphabet size: " + srcCorpus.alphabetSize());
    }

    /** Prints every source token with its position and owning document id. */
    public void printSrcByDoc() {
        for (int i = 0; i < srcSequence.size(); i++) {
            System.out.printf("i:%d,Doc:%d, Token:%s\n", i, srcCorpus.getDocumentId(i),
                    srcCorpus.toToken(srcSequence.get(i)));
            // Fix: removed a dead "srcSequence.get(i);" statement whose result was discarded.
        }
    }

    /** Prints every target token with its position and owning document id. Requires a target corpus. */
    public void printDstByDoc() {
        for (int i = 0; i < dstSequence.size(); i++) {
            System.out.printf("i:%d,Doc:%d, Token:%s\n", i, dstCorpus.getDocumentId(i),
                    dstCorpus.toToken(dstSequence.get(i)));
            // Fix: removed a dead "dstSequence.get(i);" statement whose result was discarded.
        }
    }

    /** Prints extracted contexts for the source corpus; see {@link #printContexts}. */
    public void printSrcContexts(IntervalSetBuilder intervalSetBuilder, String delimiter, int windowSize) {
        printContexts(srcCorpus, srcLST, intervalSetBuilder, delimiter, windowSize);
    }

    /** Prints extracted contexts for the target corpus; see {@link #printContexts}. */
    public void printDstContexts(IntervalSetBuilder intervalSetBuilder, String delimiter, int windowSize) {
        printContexts(dstCorpus, dstLST, intervalSetBuilder, delimiter, windowSize);
    }

    /**
     * Extracts embedded contexts (with a fixed filter of ratio 0.5 and at least
     * one non-empty filler) and prints each as "left fillers right" followed by
     * its embedded/embedding size ratio.
     *
     * @param corpus             corpus the intervals refer to
     * @param lst                the corpus's linearized suffix tree
     * @param intervalSetBuilder strategy for selecting candidate intervals
     * @param delimiter          token delimiter for the printed sequences
     * @param windowSize         context-extraction window size
     */
    public void printContexts(Corpus<String> corpus, LinearizedSuffixTree lst, IntervalSetBuilder intervalSetBuilder,
            String delimiter, int windowSize) {
        Filter<EmbeddedInterval> contextFilter = new SimpleContextFilter(0.5, 1);
        Iterable<EmbeddedInterval> contextExtractor = new EmbeddedContextExtractor(lst, corpus, intervalSetBuilder,
                contextFilter, windowSize);
        for (EmbeddedInterval embeddedInterval : contextExtractor) {
            Context context = new EmbeddedContext(embeddedInterval);
            Interval interval = embeddedInterval.getEmbeddingInterval();
            double leftRightContextRatio = (double) embeddedInterval.size() / interval.size();
            System.out.printf("%s %s %s\n", corpus.toString(context.leftSequence(), delimiter),
                    corpus.toString(embeddedInterval.fillerIterator(), delimiter),
                    corpus.toString(context.rightSequence(), delimiter));
            System.out.printf("ratio:%.2f\n", leftRightContextRatio);
        }
    }

    /**
     * Prints recursively extracted intervals of the source corpus.
     *
     * @param minLeftRightContextRatio minimum embedded/embedding size ratio a context must reach
     * @param minNonEmptyFillerCount   minimum number of non-empty fillers a context must have
     * @param depth                    starting recursion depth
     * @param maxDepth                 maximum recursion depth (-1 appears to mean unbounded — confirm)
     * @param windowSize               context-extraction window size
     */
    public void printSrcRecursiveIntervals(IntervalSetBuilder intervalSetBuilder, double minLeftRightContextRatio,
            int minNonEmptyFillerCount, int depth, int maxDepth, int windowSize) {
        Filter<EmbeddedInterval> contextFilter = new SimpleContextFilter(minLeftRightContextRatio,
                minNonEmptyFillerCount);
        printRecursiveIntervals(srcCorpus, srcLST, intervalSetBuilder, contextFilter, windowSize, depth, maxDepth);
    }

    /** Iterates the given corpus's recursive intervals and prints each one's label. */
    public void printRecursiveIntervals(Corpus<String> corpus, LinearizedSuffixTree lst,
            IntervalSetBuilder intervalSetBuilder, Filter<EmbeddedInterval> contextFilter, int windowSize, int depth,
            int maxDepth) {
        Iterator<Interval> intervalIterator = new RecursiveIntervalExtractor(lst, corpus, intervalSetBuilder,
                contextFilter, windowSize).iterator(depth, maxDepth);
        while (intervalIterator.hasNext()) {
            Interval interval = intervalIterator.next();
            System.out.println(toString(interval, corpus, false));
        }
    }

    /**
     * Factory hook for the aligner; subclasses may override to supply a
     * differently configured {@link Aligner}.
     */
    protected <C extends Interval> Aligner<C> getAligner(IntervalVectorBuilder<C> vectorBuilder,
            Iterator<C> srcIterator, Iterator<C> dstIterator, Corpus<String> srcCorpus, Corpus<String> dstCorpus,
            double minSimilarity, int alignmentsNumber, int minVectorSize) {
        return new Aligner<C>(vectorBuilder, srcIterator, dstIterator, srcCorpus, dstCorpus, minSimilarity,
                alignmentsNumber, minVectorSize);
    }

    /** Factory hook for the context extractor; subclasses may override. */
    protected Iterable<EmbeddedInterval> getContextExtractor(LinearizedSuffixTree lst, Corpus<String> corpus,
            IntervalSetBuilder intervalSetBuilder, Filter<EmbeddedInterval> contextFilter, int windowSize) {
        return new EmbeddedContextExtractor(lst, corpus, intervalSetBuilder, contextFilter, windowSize);
    }

    /** Aligns proper binary intervals of the two corpora and prints every alignment (plain-text form). */
    public void printIntervalAlignments(double minSimilarity, int alignmentsNumber, int minVectorSize) {
        IntervalSetBuilder intervalSetBuilder = new ProperIntervalSetBuilder();
        Aligner<BinaryInterval> aligner = getAligner(new IntervalVectorBuilder<BinaryInterval>(), intervalSetBuilder
                .buildIntervalSet(srcLST).iterator(), intervalSetBuilder.buildIntervalSet(dstLST).iterator(),
                srcCorpus, dstCorpus, minSimilarity, alignmentsNumber, minVectorSize);
        for (AlignmentQueue<BinaryInterval> queue : aligner) {
            for (AlignmentQueue<BinaryInterval>.Alignment alignment : queue) {
                print(alignment, false);
            }
        }
    }

    /** Aligns embedded contexts of the two corpora and prints every alignment (plain-text form). */
    public void printContextAlignments(double minSimilarity, int alignmentsNumber, int minVectorSize, int windowSize) {
        Filter<EmbeddedInterval> contextFilter = new SimpleContextFilter(0.5, 1);
        Iterable<EmbeddedInterval> srcContextExtractor = getContextExtractor(srcLST, srcCorpus,
                new ProperIntervalSetBuilder(), contextFilter, windowSize);
        Iterable<EmbeddedInterval> dstContextExtractor = getContextExtractor(dstLST, dstCorpus,
                new ProperIntervalSetBuilder(), contextFilter, windowSize);
        Aligner<EmbeddedInterval> aligner = getAligner(new IntervalVectorBuilder<EmbeddedInterval>(),
                srcContextExtractor.iterator(), dstContextExtractor.iterator(), srcCorpus, dstCorpus, minSimilarity,
                alignmentsNumber, minVectorSize);
        for (AlignmentQueue<EmbeddedInterval> queue : aligner) {
            for (AlignmentQueue<EmbeddedInterval>.Alignment alignment : queue) {
                print(alignment, false);
            }
        }
    }

    /** Convenience overload of the full {@code printRecursiveAlignments} with plain-text output. */
    public void printRecursiveAlignments(IntervalSetBuilder intervalSetBuilder, double minLeftRightContextRatio,
            int minNonEmptyFillerCount, double minSimilarity, int alignmentsNumber, int minVectorSize, int windowSize,
            int srcDepth, int srcMaxDepth, int dstDepth, int dstMaxDepth) {
        printRecursiveAlignments(intervalSetBuilder, minLeftRightContextRatio, minNonEmptyFillerCount, minSimilarity,
                alignmentsNumber, minVectorSize, windowSize, srcDepth, srcMaxDepth, dstDepth, dstMaxDepth, false);
    }

    /**
     * Aligns recursively extracted intervals of the source and target corpora
     * and prints every alignment.
     *
     * @param asLaTeX when {@code true}, prints rows suitable for a LaTeX table
     */
    public void printRecursiveAlignments(IntervalSetBuilder intervalSetBuilder, double minLeftRightContextRatio,
            int minNonEmptyFillerCount, double minSimilarity, int alignmentsNumber, int minVectorSize, int windowSize,
            int srcDepth, int srcMaxDepth, int dstDepth, int dstMaxDepth, boolean asLaTeX) {
        Filter<EmbeddedInterval> contextFilter = new SimpleContextFilter(minLeftRightContextRatio,
                minNonEmptyFillerCount);
        Iterator<Interval> srcIntervalIterator = new RecursiveIntervalExtractor(srcLST, srcCorpus, intervalSetBuilder,
                contextFilter, windowSize).iterator(srcDepth, srcMaxDepth);
        Iterator<Interval> dstIntervalIterator = new RecursiveIntervalExtractor(dstLST, dstCorpus, intervalSetBuilder,
                contextFilter, windowSize).iterator(dstDepth, dstMaxDepth);
        Aligner<Interval> aligner = getAligner(new IntervalVectorBuilder<Interval>(), srcIntervalIterator,
                dstIntervalIterator, srcCorpus, dstCorpus, minSimilarity, alignmentsNumber, minVectorSize);
        for (AlignmentQueue<Interval> queue : aligner) {
            for (AlignmentQueue<Interval>.Alignment alignment : queue) {
                print(alignment, asLaTeX);
            }
        }
    }

    /**
     * Prints one alignment, either as a LaTeX table row ("src & dst & sim")
     * or as a plain-text pair followed by the similarity score.
     */
    public void print(AlignmentQueue<? extends Interval>.Alignment alignment, boolean asLaTeX) {
        String source = toString(alignment.getSource(), srcCorpus, asLaTeX);
        String target = toString(alignment.getTarget(), dstCorpus, asLaTeX);
        if (asLaTeX) {
            System.out.printf("%s & %s & %.2f\\\\\n\\hline\n", source, target, alignment.getSimilarity());
        } else {
            System.out.printf("%s\n~\n%s(%.2f)\n\n", source, target, alignment.getSimilarity());
        }
    }

    /**
     * Renders an interval's label via the corpus. Embedded intervals recurse on
     * their embedding interval first; in LaTeX mode the fillers are elided as
     * "\dots", otherwise the filler set is printed between the two labels.
     */
    public static <S> String toString(Interval interval, Corpus<S> corpus, boolean asLaTeX) {
        if (interval instanceof EmbeddedInterval) {
            EmbeddedInterval embeddedInterval = (EmbeddedInterval) interval;
            String embeddingLabel = toString(embeddedInterval.getEmbeddingInterval(), corpus, asLaTeX);
            String embeddedLabel = corpus.toString(embeddedInterval.label(), delimiter);
            if (asLaTeX) {
                return String.format("%s \\dots %s", embeddingLabel, embeddedLabel);
            } else {
                return String.format("%s %s %s", embeddingLabel,
                        corpus.toString(embeddedInterval.fillerSet(), delimiter), embeddedLabel);
            }
        }
        return String.format("%s", corpus.toString(interval.label(), delimiter));
    }

    /**
     * Reads the given reader line by line into a sorted set of distinct lines.
     * The reader is intentionally NOT closed here — it belongs to the caller.
     *
     * @throws IOException if reading fails
     */
    public static Set<String> toStringSet(Reader reader) throws IOException {
        Set<String> stringSet = new TreeSet<String>();
        BufferedReader bufferedReader = new BufferedReader(reader);
        for (String line = bufferedReader.readLine(); line != null; line = bufferedReader.readLine()) {
            stringSet.add(line);
        }
        return stringSet;
    }

    /** Sets the global token delimiter used by {@link #toString(Interval, Corpus, boolean)}. */
    public static void setDelimiter(String delimiter) {
        Analyser.delimiter = delimiter;
    }

    /** Returns the global token delimiter. */
    public static String getDelimiter() {
        return Analyser.delimiter;
    }
}