package com.googlecode.gaal.cli;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.Iterator;
import java.util.Set;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.OptionGroup;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import com.googlecode.gaal.analysis.api.IntervalSetBuilder;
import com.googlecode.gaal.analysis.impl.Aligner;
import com.googlecode.gaal.analysis.impl.Analyser;
import com.googlecode.gaal.analysis.impl.ConcurrentAnalyser;
import com.googlecode.gaal.analysis.impl.NonSingletonBwtSetBuilder;
import com.googlecode.gaal.analysis.impl.ProperIntervalSetBuilder;
import com.googlecode.gaal.analysis.impl.SupermaximalSetBuilder;
import com.googlecode.gaal.data.api.Corpus;
import com.googlecode.gaal.data.impl.TreeMapCorpus;
import com.googlecode.gaal.preprocess.api.Tokenizer;
import com.googlecode.gaal.preprocess.api.Tokenizer.Document;
import com.googlecode.gaal.preprocess.impl.LowerCaseNormalizer;
import com.googlecode.gaal.preprocess.impl.MultidocumentRegexTokenizer;
import com.googlecode.gaal.preprocess.impl.StopWordRemover;
/**
 * Command line front end: parses the options, builds the corpora from a source
 * file (and optionally a target file) and prints alignments, corpus info,
 * repeats or tokens, depending on the selected action.
 */
public class Main {
    /** Program name shown in every usage message. */
    private static final String USAGE_COMMAND = "java -jar gaal.jar";

    /** Analyser shared by all callers; created on the first {@link #getAnalyser} call. */
    private static Analyser ANALYSER;

    /**
     * Returns the shared analyser, creating it on the first call.
     * <p>
     * Only the first call uses the arguments; once an analyser exists it is
     * returned as is, whatever arguments are passed. No synchronization is
     * performed. The process exits with status 1 if an input file cannot be
     * opened.
     *
     * @param srcFileName   source corpus file
     * @param dstFileName   target corpus file, or {@code null} for a source-only analyser
     * @param corpusSize    size limit handed to the corpus, {@code -1} when not set on the command line
     * @param regex         tokenizer regular expression
     * @param srcStopWords  stop words removed from the source text
     * @param dstStopWords  stop words removed from the target text
     * @param srcSeparators separators handed to the source corpus
     * @param dstSeparators separators handed to the target corpus
     * @return the shared analyser
     */
    public static Analyser getAnalyser(String srcFileName, String dstFileName, int corpusSize, String regex,
            Set<String> srcStopWords, Set<String> dstStopWords, Set<String> srcSeparators, Set<String> dstSeparators) {
        if (ANALYSER == null) {
            Corpus<String> srcCorpus = readCorpus(srcFileName, "source", regex, srcStopWords, srcSeparators,
                    corpusSize);
            Corpus<String> dstCorpus = null;
            if (dstFileName != null) {
                dstCorpus = readCorpus(dstFileName, "target", regex, dstStopWords, dstSeparators, corpusSize);
            }
            ANALYSER = new ConcurrentAnalyser(srcCorpus, dstCorpus);
        }
        return ANALYSER;
    }

    /**
     * Reads a set of strings from a file.
     * <p>
     * The process exits with status 1 if the file cannot be found or read.
     *
     * @param fileName file to read
     * @return the strings produced by {@code Analyser.toStringSet} for the file
     */
    public static Set<String> readStringSet(String fileName) {
        FileReader reader = null;
        try {
            reader = new FileReader(fileName);
            return Analyser.toStringSet(reader);
        } catch (FileNotFoundException e) {
            System.err.printf("file not found: %s%n", e.getMessage());
            System.exit(1);
        } catch (IOException e) {
            System.err.printf("can't open file: %s%n", e.getMessage());
            System.exit(1);
        } finally {
            closeQuietly(reader);
        }
        // only reached formally: both failure paths terminate the JVM
        return null;
    }

    /**
     * Reads a whole text file into a string.
     * <p>
     * Every line of the file is terminated with a single {@code '\n'} in the
     * result, including the last one. The process exits with status 1 if the
     * file cannot be found or read.
     *
     * @param fileName file to read
     * @return the file content with normalized line terminators
     */
    public static String readString(String fileName) {
        StringBuilder sb = new StringBuilder();
        BufferedReader bufferedReader = null;
        try {
            bufferedReader = new BufferedReader(new FileReader(fileName));
            for (String line = bufferedReader.readLine(); line != null; line = bufferedReader.readLine()) {
                sb.append(line);
                sb.append('\n');
            }
        } catch (FileNotFoundException e) {
            System.err.printf("file not found: %s%n", e.getMessage());
            System.exit(1);
        } catch (IOException e) {
            System.err.printf("can't open file: %s%n", e.getMessage());
            System.exit(1);
        } finally {
            closeQuietly(bufferedReader);
        }
        return sb.toString();
    }

    /**
     * Tokenizes a file and prints its tokens to standard output, grouped by
     * document.
     * <p>
     * The process exits with status 1 if the file cannot be opened.
     *
     * @param fileName   file to tokenize
     * @param regex      tokenizer regular expression
     * @param stopWords  stop words removed from the text
     * @param corpusSize maximal number of documents to print, {@code -1} for no limit
     */
    public static void printTokens(String fileName, String regex, Set<String> stopWords, int corpusSize) {
        FileReader srcReader = openReader(fileName, "source");
        try {
            Tokenizer<String> tokenizer = new MultidocumentRegexTokenizer(srcReader, regex, new StopWordRemover(
                    stopWords, new LowerCaseNormalizer()));
            Iterator<Document<String>> docIter = tokenizer.iterator();
            // NOTE(review): the limit is applied per document, assuming the tokenizer
            // yields one document per input line ("lines to use") - confirm against
            // the way TreeMapCorpus interprets the same value.
            int documentCount = 0;
            while (docIter.hasNext()) {
                if (corpusSize != -1 && documentCount >= corpusSize) {
                    return;
                }
                Document<String> doc = docIter.next();
                documentCount++;
                System.out.format("\nDocument #%d:\n", doc.getId());
                Iterator<String> tokIter = doc.iterator();
                while (tokIter.hasNext()) {
                    System.out.format("\"%s\"\n", tokIter.next());
                }
            }
        } finally {
            closeQuietly(srcReader);
        }
    }

    /**
     * Program entry point: parses the command line and runs the selected
     * action. Prints the usage help when the command line is invalid or does
     * not select a runnable combination of input and action.
     *
     * @param args command line arguments
     * @throws IOException declared for the analyser calls made here
     */
    public static void main(String[] args) throws IOException {
        Options options = buildOptions();
        CommandLineParser parser = new GnuParser();
        try {
            CommandLine line = parser.parse(options, args);
            int corpusSize = intOption(line, "l", -1);

            // -d sets both depths and takes precedence over the per-side options
            int srcDepth;
            int dstDepth;
            if (line.hasOption('d')) {
                srcDepth = intOption(line, "d", -1);
                dstDepth = srcDepth;
            } else {
                srcDepth = intOption(line, "sd", -1);
                dstDepth = intOption(line, "td", -1);
            }

            // -m sets both maximal depths and takes precedence over the per-side options
            int srcMaxDepth;
            int dstMaxDepth;
            if (line.hasOption('m')) {
                srcMaxDepth = intOption(line, "m", -1);
                dstMaxDepth = srcMaxDepth;
            } else {
                srcMaxDepth = intOption(line, "sm", -1);
                dstMaxDepth = intOption(line, "tm", -1);
            }

            int windowSize = intOption(line, "w", 5);
            double minLeftRightContextRatio = doubleOption(line, "cratio", 0.5);
            int minNonEmptyFillerCount = intOption(line, "nfill", 1);
            if (line.hasOption("del")) {
                Analyser.setDelimiter(line.getOptionValue("del"));
            }

            // a regex file, when given, overrides an inline regex
            String regex = Analyser.STRING_REGEX;
            if (line.hasOption("tok")) {
                regex = line.getOptionValue("tok");
            }
            if (line.hasOption("tokfile")) {
                regex = readString(line.getOptionValue("tokfile"));
            }

            Set<String> srcStopWords = Analyser.STOP_WORDS;
            Set<String> dstStopWords = Analyser.STOP_WORDS;
            if (line.hasOption("stop")) {
                srcStopWords = readStringSet(line.getOptionValue("stop"));
                dstStopWords = readStringSet(line.getOptionValue("stop"));
            } else {
                if (line.hasOption("sstop")) {
                    srcStopWords = readStringSet(line.getOptionValue("sstop"));
                }
                if (line.hasOption("tstop")) {
                    dstStopWords = readStringSet(line.getOptionValue("tstop"));
                }
            }

            Set<String> srcSeparators = Analyser.SEPARATORS;
            Set<String> dstSeparators = Analyser.SEPARATORS;
            if (line.hasOption("sep")) {
                srcSeparators = readStringSet(line.getOptionValue("sep"));
                dstSeparators = readStringSet(line.getOptionValue("sep"));
            } else {
                if (line.hasOption("ssep")) {
                    srcSeparators = readStringSet(line.getOptionValue("ssep"));
                }
                if (line.hasOption("tsep")) {
                    dstSeparators = readStringSet(line.getOptionValue("tsep"));
                }
            }

            boolean verbose = line.hasOption('v');
            if (verbose) {
                Aligner.setVerbose(true);
                System.out.println("general options:");
                System.out.printf("tokenizer regex: %s\n", regex);
                System.out.printf("source stopwords: %s\n", srcStopWords);
                System.out.printf("target stopwords: %s\n", dstStopWords);
                System.out.printf("source sentinels: %s\n", srcSeparators);
                System.out.printf("target sentinels: %s\n", dstSeparators);
                System.out.printf("corpus size used: %d lines\n", corpusSize);
                System.out.printf("source depth: %d\n", srcDepth);
                System.out.printf("target depth: %d\n", dstDepth);
                System.out.printf("source maximal depth: %d\n", srcMaxDepth);
                System.out.printf("target maximal depth: %d\n", dstMaxDepth);
                System.out.printf("window size: %d\n", windowSize);
                System.out.printf("minimal left/right context ratio: %f\n", minLeftRightContextRatio);
                System.out.printf("minimal non-empty fillers count : %d\n", minNonEmptyFillerCount);
                System.out.printf("delimiter : \"%s\"\n", Analyser.getDelimiter());
            }

            if (line.hasOption('a') && line.hasOption('s') && line.hasOption('t')) {
                double minSimilarity = doubleOption(line, "sim", 0.5);
                int alignmentsNumber = intOption(line, "n", 3);
                int minVectorSize = intOption(line, "vsize", 0);
                IntervalSetBuilder intervalSetBuilder = selectIntervalSetBuilder(line);
                if (verbose) {
                    System.out.println("alignment options:");
                    System.out.printf("minimal similarity : %f\n", minSimilarity);
                    System.out.printf("alignments number: %d\n", alignmentsNumber);
                    System.out.printf("minimal vector size: %d\n", minVectorSize);
                    System.out.print("outputing ");
                    if (intervalSetBuilder instanceof NonSingletonBwtSetBuilder) {
                        System.out.print("maximal");
                    } else if (intervalSetBuilder instanceof SupermaximalSetBuilder) {
                        System.out.print("supermaximal");
                    } else if (intervalSetBuilder instanceof ProperIntervalSetBuilder) {
                        System.out.print("all the");
                    }
                    System.out.println(" repeats");
                }
                boolean asLaTeX = line.hasOption('x');
                getAnalyser(line.getOptionValue('s'), line.getOptionValue('t'), corpusSize, regex, srcStopWords,
                        dstStopWords, srcSeparators, dstSeparators).printRecursiveAlignments(intervalSetBuilder,
                        minLeftRightContextRatio, minNonEmptyFillerCount, minSimilarity, alignmentsNumber,
                        minVectorSize, windowSize, srcDepth, srcMaxDepth, dstDepth, dstMaxDepth, asLaTeX);
            } else if (line.hasOption('s') && (line.hasOption('i') || line.hasOption('r') || line.hasOption('z'))) {
                if (line.hasOption('i')) {
                    getAnalyser(line.getOptionValue('s'), null, corpusSize, regex, srcStopWords, dstStopWords,
                            srcSeparators, dstSeparators).printSrcCorpusInfo();
                }
                if (line.hasOption('r')) {
                    IntervalSetBuilder intervalSetBuilder = selectIntervalSetBuilder(line);
                    getAnalyser(line.getOptionValue('s'), null, corpusSize, regex, srcStopWords, dstStopWords,
                            srcSeparators, dstSeparators).printSrcRecursiveIntervals(intervalSetBuilder,
                            minLeftRightContextRatio, minNonEmptyFillerCount, srcDepth, srcMaxDepth, windowSize);
                }
                if (line.hasOption('z')) {
                    printTokens(line.getOptionValue('s'), regex, srcStopWords, corpusSize);
                }
            } else {
                // covers -h as well as every incomplete input/action combination
                printUsage(options);
            }
        } catch (ParseException exp) {
            System.err.println(exp.getMessage());
            printUsage(options);
        }
    }

    /** Declares every command line option understood by {@link #main}. */
    @SuppressWarnings("static-access")
    private static Options buildOptions() {
        Option helpOption = OptionBuilder.withLongOpt("help").withDescription("this usage help").create('h');
        // input options
        Option sourceOption = OptionBuilder.withArgName("SOURCE FILE").hasArg().withDescription("source file")
                .create('s');
        // Neither option group is marked as required: a command line lacking an input
        // or an action is not a parse error but falls through to the usage help in main.
        OptionGroup inputOptionGroup = new OptionGroup();
        inputOptionGroup.addOption(helpOption);
        inputOptionGroup.addOption(sourceOption);
        Option targetOption = OptionBuilder.withArgName("TARGET FILE").hasArg().withDescription("target file")
                .create('t');
        // general options
        Option linesOption = OptionBuilder.withArgName("LINES").hasArg().withDescription("lines to use").create('l');
        Option regexOption = OptionBuilder.withArgName("TOKENIZER REGEX").hasArg().withDescription("regex for tokens")
                .withLongOpt("tok").create();
        Option regexFileOption = OptionBuilder.withArgName("TOKENIZER REGEX FILE").hasArg()
                .withDescription("regex file for tokens").withLongOpt("tokfile").create();
        Option stopOption = OptionBuilder.withArgName("STOPWORD FILE").hasArg().withDescription("stopword file")
                .withLongOpt("stop").create();
        Option srcStopOption = OptionBuilder.withArgName("SOURCE STOPWORD FILE").hasArg()
                .withDescription("source stopword file").withLongOpt("sstop").create();
        Option dstStopOption = OptionBuilder.withArgName("TARGET STOPWORD FILE").hasArg()
                .withDescription("target stopword file").withLongOpt("tstop").create();
        Option separatorOption = OptionBuilder.withArgName("SEPARATOR FILE").hasArg().withDescription("separator file")
                .withLongOpt("sep").create();
        Option srcSeparatorOption = OptionBuilder.withArgName("SOURCE SEPARATOR FILE").hasArg()
                .withDescription("source separator file").withLongOpt("ssep").create();
        Option dstSeparatorOption = OptionBuilder.withArgName("TARGET SEPARATOR FILE").hasArg()
                .withDescription("target separator file").withLongOpt("tsep").create();
        Option depthOption = OptionBuilder.withArgName("DEPTH").hasArg().withDescription("gappy phrase depth")
                .withLongOpt("depth").create("d");
        Option maxDepthOption = OptionBuilder.withArgName("MAX DEPTH").hasArg()
                .withDescription("max gappy phrase depth").withLongOpt("max-depth").create("m");
        Option srcDepthOption = OptionBuilder.withArgName("SOURCE DEPTH").hasArg()
                .withDescription("source gappy phrase depth").withLongOpt("src-depth").create("sd");
        Option srcMaxDepthOption = OptionBuilder.withArgName("SOURCE MAX DEPTH").hasArg()
                .withDescription("max source gappy phrase depth").withLongOpt("src-max-depth").create("sm");
        Option dstDepthOption = OptionBuilder.withArgName("TARGET DEPTH").hasArg()
                .withDescription("target gappy phrase depth").withLongOpt("trg-depth").create("td");
        Option dstMaxDepthOption = OptionBuilder.withArgName("TARGET MAX DEPTH").hasArg()
                .withDescription("max target gappy phrase depth").withLongOpt("trg-max-depth").create("tm");
        Option wsizeOption = OptionBuilder.withArgName("WINDOW SIZE").hasArg()
                .withDescription("window size of a gappy phrase").withLongOpt("wsize").create("w");
        Option cratioOption = OptionBuilder.withArgName("MIN CONTEXT RATIO").hasArg()
                .withDescription("minimal left/right ratio").withLongOpt("cratio").create();
        Option nfillOption = OptionBuilder.withArgName("MIN NON-EMPTY FILLERS").hasArg()
                .withDescription("minimal non-empty fillers count").withLongOpt("nfill").create();
        Option delOption = OptionBuilder.withArgName("DELIMITER").hasArg().withDescription("delimiter for printing")
                .withLongOpt("del").create();
        // alignment related options
        Option simOption = OptionBuilder.withArgName("MIN SIMILARITY").hasArg().withDescription("minimal similarity")
                .withLongOpt("sim").create();
        Option numOption = OptionBuilder.withArgName("ALIGNMENTS").hasArg().withDescription("number of top alignments")
                .create('n');
        Option vsizeOption = OptionBuilder.withArgName("VECTOR SIZE").hasArg().withDescription("minimal vector size")
                .withLongOpt("vsize").create();
        Option maxOption = OptionBuilder.withDescription("output only maximal repeats").withLongOpt("max").create();
        Option smaxOption = OptionBuilder.withDescription("output only supermaximal repeats").withLongOpt("smax")
                .create();
        Option verboseOption = OptionBuilder.withDescription("verbose").create('v');
        Option laTeXOption = OptionBuilder.withDescription("output LaTeX when possible").create('x');
        // actions
        OptionGroup actionOptionGroup = new OptionGroup();
        actionOptionGroup.addOption(OptionBuilder.withDescription("print allignments").create('a'));
        actionOptionGroup.addOption(OptionBuilder.withDescription("print corpus info").create('i'));
        actionOptionGroup.addOption(OptionBuilder.withDescription("print tokens").create('z'));
        actionOptionGroup.addOption(OptionBuilder.withDescription("print repeats").create('r'));

        Options options = new Options();
        options.addOptionGroup(inputOptionGroup);
        options.addOptionGroup(actionOptionGroup);
        options.addOption(targetOption);
        options.addOption(linesOption);
        options.addOption(regexOption);
        options.addOption(regexFileOption);
        options.addOption(stopOption);
        options.addOption(srcStopOption);
        options.addOption(dstStopOption);
        options.addOption(separatorOption);
        options.addOption(srcSeparatorOption);
        options.addOption(dstSeparatorOption);
        options.addOption(simOption);
        options.addOption(numOption);
        options.addOption(vsizeOption);
        options.addOption(wsizeOption);
        options.addOption(cratioOption);
        options.addOption(nfillOption);
        options.addOption(delOption);
        options.addOption(depthOption);
        options.addOption(maxDepthOption);
        options.addOption(srcDepthOption);
        options.addOption(srcMaxDepthOption);
        options.addOption(dstDepthOption);
        options.addOption(dstMaxDepthOption);
        options.addOption(verboseOption);
        options.addOption(laTeXOption);
        options.addOption(maxOption);
        options.addOption(smaxOption);
        return options;
    }

    /** Builds a corpus from a file; exits with status 1 if the file cannot be opened. */
    private static Corpus<String> readCorpus(String fileName, String description, String regex,
            Set<String> stopWords, Set<String> separators, int corpusSize) {
        // NOTE(review): the reader is not closed here because it is not visible from
        // this class whether the corpus consumes the tokenizer eagerly - confirm.
        FileReader reader = openReader(fileName, description);
        Tokenizer<String> tokenizer = new MultidocumentRegexTokenizer(reader, regex, new StopWordRemover(stopWords,
                new LowerCaseNormalizer()));
        return new TreeMapCorpus(tokenizer, separators, corpusSize);
    }

    /** Opens a file for reading; reports the failure and exits with status 1 if it cannot be opened. */
    private static FileReader openReader(String fileName, String description) {
        try {
            return new FileReader(fileName);
        } catch (FileNotFoundException e) {
            System.err.printf("can't open %s file: %s%n", description, e.getMessage());
            System.exit(1);
            // only reached formally: System.exit does not return
            return null;
        }
    }

    /** Closes a resource, ignoring a {@code null} argument and any failure to close. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource != null) {
            try {
                resource.close();
            } catch (IOException ignored) {
                // the input has already been consumed; a failing close cannot affect the result
            }
        }
    }

    /** Chooses the interval set builder: {@code --max} wins over {@code --smax}, otherwise all intervals. */
    private static IntervalSetBuilder selectIntervalSetBuilder(CommandLine line) {
        if (line.hasOption("max")) {
            return new NonSingletonBwtSetBuilder();
        }
        if (line.hasOption("smax")) {
            return new SupermaximalSetBuilder();
        }
        return new ProperIntervalSetBuilder();
    }

    /** Returns the integer value of an option, or the default when the option is absent. */
    private static int intOption(CommandLine line, String opt, int defaultValue) throws ParseException {
        if (!line.hasOption(opt)) {
            return defaultValue;
        }
        String value = line.getOptionValue(opt);
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException e) {
            // reported like any other command line error instead of a stack trace
            throw new ParseException("Invalid integer value for option " + opt + ": " + value);
        }
    }

    /** Returns the floating point value of an option, or the default when the option is absent. */
    private static double doubleOption(CommandLine line, String opt, double defaultValue) throws ParseException {
        if (!line.hasOption(opt)) {
            return defaultValue;
        }
        String value = line.getOptionValue(opt);
        try {
            return Double.parseDouble(value);
        } catch (NumberFormatException e) {
            // reported like any other command line error instead of a stack trace
            throw new ParseException("Invalid numeric value for option " + opt + ": " + value);
        }
    }

    /** Prints the usage help for the given options. */
    private static void printUsage(Options options) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(USAGE_COMMAND, options, true);
    }
}