/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.corpus.suffix_array;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutput;
import java.io.ObjectOutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.util.Scanner;
import java.util.logging.Level;
import java.util.logging.Logger;
import joshua.corpus.AlignedParallelCorpus;
import joshua.corpus.CorpusArray;
import joshua.corpus.ParallelCorpus;
import joshua.corpus.alignment.AlignmentGrids;
import joshua.corpus.lexprob.LexProbs;
import joshua.corpus.lexprob.LexicalProbabilities;
import joshua.corpus.vocab.Vocabulary;
import joshua.decoder.JoshuaConfiguration;
import joshua.util.Cache;
import joshua.util.io.BinaryOut;
/**
* Compiles a parallel corpus into binary data files.
*
* @author Lane Schwartz
* @version $LastChangedDate: 2010-02-11 15:47:37 -0600 (Thu, 11 Feb 2010) $
*/
public class Compile {
/** Logger for this class. */
private static final Logger logger =
Logger.getLogger(Compile.class.getName());
private String sourceCorpusFileName;
private String targetCorpusFileName;
private String alignmentsFileName;
private String outputDirName;
private String charset = "UTF-8";
private int minFrequency = 0;
private short maxPhrases = 100;
private int maxPhraseLength = JoshuaConfiguration.sa_max_phrase_length;
private int maxPhraseSpan = JoshuaConfiguration.sa_max_phrase_span;
private int minNonterminalSpan = JoshuaConfiguration.sa_min_nonterminal_span;
public void setMinNonterminalSpan(int minNonterminalSpan) {
this.minNonterminalSpan = minNonterminalSpan;
}
public void setMaxPhraseSpan(int maxPhraseSpan) {
this.maxPhraseSpan = maxPhraseSpan;
}
public void setMinFrequency(int minFrequency) {
this.minFrequency = minFrequency;
}
public void setMaxPhrases(short maxPhrases) {
this.maxPhrases = maxPhrases;
}
public void setMaxPhraseLength(int maxPhraseLength) {
this.maxPhraseLength = maxPhraseLength;
}
public void setSourceCorpus(String sourceCorpusFileName) {
this.sourceCorpusFileName = sourceCorpusFileName;
}
public void setTargetCorpus(String targetCorpusFileName) {
this.targetCorpusFileName = targetCorpusFileName;
}
public void setAlignments(String alignmentsFileName) {
this.alignmentsFileName = alignmentsFileName;
}
public void setOutputDir(String outputDirName) {
this.outputDirName = outputDirName;
}
public void setEncoding(String charset) {
this.charset = charset;
}
public void execute() throws IOException {
// Verify that output directory exists or can be created
File outputDir = new File(outputDirName);
if (! outputDir.exists()) {
boolean success = outputDir.mkdirs();
if (! success) {
logger.severe("Output directory does not exist, and could not be successfully created: " + outputDirName);
System.exit(-1);
}
} else if (! outputDir.isDirectory()) {
logger.severe("Output directory exists, but is not a directory: " + outputDirName);
System.exit(-2);
} else if (! outputDirName.endsWith(".josh")) {
logger.warning("By convention, the output directory should end in .josh");
}
// Construct common vocabulary
Vocabulary symbolTable = new Vocabulary();
if (logger.isLoggable(Level.INFO)) logger.info("Adding terminal tokens from file " + sourceCorpusFileName + " to common vocabulary");
int[] sourceLengths = Vocabulary.initializeVocabulary(sourceCorpusFileName, symbolTable, false);
if (logger.isLoggable(Level.INFO)) logger.info("Adding terminal tokens from file " + targetCorpusFileName + " to common vocabulary");
int[] targetLengths = Vocabulary.initializeVocabulary(targetCorpusFileName, symbolTable, true);
if (sourceLengths[1] != targetLengths[1]) {
logger.severe("Source corpus and target corpus have different number of sentences (" + sourceLengths[1] + " vs " + targetLengths[1] + ")");
System.exit(-3);
}
int numberOfSentences = sourceLengths[1];
// Write README file to disk
String readmeFilename = outputDirName + File.separator + "README.txt";
PrintStream out = new PrintStream(readmeFilename);
out.println("This directory contains the following binary files:");
out.println();
// Write vocabulary to disk
{
String binaryVocabFilename = outputDirName + File.separator + "common.vocab";
if (logger.isLoggable(Level.INFO)) logger.info("Writing binary common vocabulary to disk at " + binaryVocabFilename);
ObjectOutput vocabOut =
new BinaryOut(new FileOutputStream(binaryVocabFilename), true);
symbolTable.setExternalizableEncoding(charset);
symbolTable.writeExternal(vocabOut);
vocabOut.flush();
out.println("Common symbol table for source and target language: " + binaryVocabFilename);
}
// Construct source language corpus
if (logger.isLoggable(Level.INFO)) logger.info("Constructing corpus array from file " + sourceCorpusFileName);
CorpusArray sourceCorpusArray = SuffixArrayFactory.createCorpusArray(sourceCorpusFileName, symbolTable, sourceLengths[0], sourceLengths[1]);
// Write source corpus to disk
{
String binarySourceCorpusFilename = outputDirName + File.separator + "source.corpus";
if (logger.isLoggable(Level.INFO)) logger.info("Writing binary source corpus to disk at " + binarySourceCorpusFilename);
BinaryOut corpusOut = new BinaryOut(new FileOutputStream(binarySourceCorpusFilename), false);
sourceCorpusArray.writeExternal(corpusOut);
corpusOut.flush();
out.println("Source language corpus: " + binarySourceCorpusFilename);
}
// Construct target language corpus
if (logger.isLoggable(Level.INFO)) logger.info("Constructing corpus array from file " + targetCorpusFileName);
CorpusArray targetCorpusArray = SuffixArrayFactory.createCorpusArray(targetCorpusFileName, symbolTable, targetLengths[0], targetLengths[1]);
// Write target language corpus to disk
{
String binaryTargetCorpusFilename = outputDirName + File.separator + "target.corpus";
if (logger.isLoggable(Level.INFO)) logger.info("Writing binary target corpus to disk at " + binaryTargetCorpusFilename);
BinaryOut corpusOut = new BinaryOut(new FileOutputStream(binaryTargetCorpusFilename), false);
targetCorpusArray.writeExternal(corpusOut);
corpusOut.flush();
out.println("Target language corpus: " + binaryTargetCorpusFilename);
}
{
// Construct alignments data structure
AlignmentGrids grids = new AlignmentGrids(
new Scanner(new File(alignmentsFileName)),
sourceCorpusArray,
targetCorpusArray,
numberOfSentences);
// Write alignments to disk
{
String binaryAlignmentsFilename = outputDirName + File.separator + "alignment.grids";
if (logger.isLoggable(Level.INFO)) logger.info("Writing binary alignment grids to disk at " + binaryAlignmentsFilename);
BinaryOut alignmentsOut = new BinaryOut(binaryAlignmentsFilename);
grids.writeExternal(alignmentsOut);
alignmentsOut.flush();
alignmentsOut.close();
out.println("Source-target alignment grids: " + binaryAlignmentsFilename);
}
// Write lexprobs to disk
{
ParallelCorpus parallelCorpus = new AlignedParallelCorpus(sourceCorpusArray, targetCorpusArray, grids);
if (logger.isLoggable(Level.INFO)) logger.info("Constructing lexprob table");
LexicalProbabilities lexProbs =
new LexProbs(parallelCorpus, Float.MIN_VALUE);
String lexprobsFilename = outputDirName + File.separator + "lexprobs.txt";
FileOutputStream stream = new FileOutputStream(lexprobsFilename);
OutputStreamWriter lexprobsOut = new OutputStreamWriter(stream, charset);
String binaryLexCountFilename = outputDirName + File.separator + "lexicon.counts";
if (logger.isLoggable(Level.INFO)) logger.info("Writing binary lexicon counts to disk at " + binaryLexCountFilename);
// BinaryOut lexCountOut = new BinaryOut(binaryLexCountFilename);
ObjectOutput lexCountOut = new ObjectOutputStream(new FileOutputStream(binaryLexCountFilename));
lexProbs.writeExternal(lexCountOut);
lexCountOut.close();
String s = lexProbs.toString();
if (logger.isLoggable(Level.INFO)) logger.info("Writing lexprobs at " + lexprobsFilename);
lexprobsOut.write(s);
lexprobsOut.flush();
lexprobsOut.close();
out.println("Lexprobs at " + lexprobsFilename);
}
}
// Write target language suffix array to disk
{
// Construct target language suffix array
if (logger.isLoggable(Level.INFO)) logger.info("Constructing suffix array from file " + targetCorpusFileName);
SuffixArray targetSuffixArray = SuffixArrayFactory.createSuffixArray(targetCorpusArray, Cache.DEFAULT_CAPACITY);
String binaryTargetSuffixesFilename = outputDirName + File.separator + "target.suffixes";
if (logger.isLoggable(Level.INFO)) logger.info("Writing binary target corpus to disk at " + binaryTargetSuffixesFilename);
BinaryOut suffixesOut = new BinaryOut(new FileOutputStream(binaryTargetSuffixesFilename), false);
targetSuffixArray.writeExternal(suffixesOut);
suffixesOut.flush();
out.println("Target language suffix array: " + binaryTargetSuffixesFilename);
}
{
// Construct source language suffix array
if (logger.isLoggable(Level.INFO)) logger.info("Constructing suffix array from file " + sourceCorpusFileName);
SuffixArray sourceSuffixArray = SuffixArrayFactory.createSuffixArray(sourceCorpusArray, Cache.DEFAULT_CAPACITY);
// Write source language suffix array to disk
{
String binarySourceSuffixesFilename = outputDirName + File.separator + "source.suffixes";
if (logger.isLoggable(Level.INFO)) logger.info("Writing binary source corpus to disk at " + binarySourceSuffixesFilename);
BinaryOut suffixesOut = new BinaryOut(new FileOutputStream(binarySourceSuffixesFilename), false);
sourceSuffixArray.writeExternal(suffixesOut);
suffixesOut.flush();
out.println("Source language suffix array: " + binarySourceSuffixesFilename);
}
// Precompute and write frequent phrase locations to disk
{
if (logger.isLoggable(Level.INFO)) logger.info("Precomputing indices for most frequent phrases");
FrequentPhrases frequentPhrases =
new FrequentPhrases(sourceSuffixArray, minFrequency, maxPhrases, maxPhraseLength, maxPhraseLength, maxPhraseSpan, minNonterminalSpan);
String frequentPhrasesFilename = outputDirName + File.separator + "frequentPhrases";
if (logger.isLoggable(Level.INFO)) logger.info("Writing precomputing indices for most frequent phrases at " + frequentPhrasesFilename);
BinaryOut frequentPhrasesOut = new BinaryOut(frequentPhrasesFilename);
frequentPhrases.writeExternal(frequentPhrasesOut);
frequentPhrasesOut.close();
}
}
out.flush();
out.close();
if (logger.isLoggable(Level.INFO)) logger.info("Completed writing binary files to disk");
}
public static void main(String[] args) throws IOException {
if (args.length < 4) {
System.err.println("Usage: java " + Compile.class.getName() + " sourceCorpus targetCorpus alignmentsFile outputDir.josh");
System.exit(0);
}
Compile compiler = new Compile();
compiler.setSourceCorpus(args[0]);
compiler.setTargetCorpus(args[1]);
compiler.setAlignments(args[2]);
compiler.setOutputDir(args[3]);
if (args.length > 4) compiler.setEncoding(args[4]);
compiler.execute();
}
}