Source Code of joshua.corpus.suffix_array.Compile

/* This file is part of the Joshua Machine Translation System.
 * 
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */
package joshua.corpus.suffix_array;


import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutput;
import java.io.ObjectOutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.util.Scanner;
import java.util.logging.Level;
import java.util.logging.Logger;


import joshua.corpus.AlignedParallelCorpus;
import joshua.corpus.CorpusArray;
import joshua.corpus.ParallelCorpus;
import joshua.corpus.alignment.AlignmentGrids;
import joshua.corpus.lexprob.LexProbs;
import joshua.corpus.lexprob.LexicalProbabilities;
import joshua.corpus.vocab.Vocabulary;
import joshua.decoder.JoshuaConfiguration;
import joshua.util.Cache;
import joshua.util.io.BinaryOut;


/**
 * Compiles a parallel corpus into binary data files.
 * 
 * @author Lane Schwartz
 * @version $LastChangedDate: 2010-02-11 15:47:37 -0600 (Thu, 11 Feb 2010) $
 */
public class Compile {


  /** Logger for this class. */
  private static final Logger logger =
    Logger.getLogger(Compile.class.getName());
  
  
  private String sourceCorpusFileName;
  private String targetCorpusFileName;
  private String alignmentsFileName;
  private String outputDirName;
  private String charset = "UTF-8";
  
  private int minFrequency = 0;
  private short maxPhrases = 100;


  private int maxPhraseLength = JoshuaConfiguration.sa_max_phrase_length;
  
  private int maxPhraseSpan = JoshuaConfiguration.sa_max_phrase_span;
  
  private int minNonterminalSpan = JoshuaConfiguration.sa_min_nonterminal_span;
  
  public void setMinNonterminalSpan(int minNonterminalSpan) {
    this.minNonterminalSpan = minNonterminalSpan;
  }
  
  public void setMaxPhraseSpan(int maxPhraseSpan) {
    this.maxPhraseSpan = maxPhraseSpan;
  }
  
  public void setMinFrequency(int minFrequency) {
    this.minFrequency = minFrequency;
  }
  
  public void setMaxPhrases(short maxPhrases) {
    this.maxPhrases = maxPhrases;
  }
  
  public void setMaxPhraseLength(int maxPhraseLength) {
    this.maxPhraseLength = maxPhraseLength;
  }
  
  public void setSourceCorpus(String sourceCorpusFileName) {
    this.sourceCorpusFileName = sourceCorpusFileName;
  }
  
  public void setTargetCorpus(String targetCorpusFileName) {
    this.targetCorpusFileName = targetCorpusFileName;
  }
  
  public void setAlignments(String alignmentsFileName) {
    this.alignmentsFileName = alignmentsFileName;
  }
  
  
  public void setOutputDir(String outputDirName) {
    this.outputDirName = outputDirName;
  }
  
  public void setEncoding(String charset) {
    this.charset = charset;
  }
  
  public void execute() throws IOException {
    // Verify that output directory exists or can be created
    File outputDir = new File(outputDirName);
    if (! outputDir.exists()) {
      boolean success = outputDir.mkdirs();
      if (! success) {
        logger.severe("Output directory does not exist, and could not be successfully created: " + outputDirName);
        System.exit(-1);
      }
    } else if (! outputDir.isDirectory()) {
      logger.severe("Output directory exists, but is not a directory: " + outputDirName);
      System.exit(-2);
    } else if (! outputDirName.endsWith(".josh")) {
      logger.warning("By convention, the output directory should end in .josh");
    }
    
    
    // Construct common vocabulary
    Vocabulary symbolTable = new Vocabulary();
    if (logger.isLoggable(Level.INFO)) logger.info("Adding terminal tokens from file " + sourceCorpusFileName + " to common vocabulary");
    int[] sourceLengths = Vocabulary.initializeVocabulary(sourceCorpusFileName, symbolTable, false);
    if (logger.isLoggable(Level.INFO)) logger.info("Adding terminal tokens from file " + targetCorpusFileName + " to common vocabulary");
    int[] targetLengths = Vocabulary.initializeVocabulary(targetCorpusFileName, symbolTable, true);
    
    if (sourceLengths[1] != targetLengths[1]) {
      logger.severe("Source corpus and target corpus have different number of sentences (" + sourceLengths[1] + " vs " + targetLengths[1] + ")");
      System.exit(-3);
    }
    int numberOfSentences = sourceLengths[1];
    
    // Write README file to disk
    String readmeFilename = outputDirName + File.separator + "README.txt";
    PrintStream out = new PrintStream(readmeFilename);


    out.println("This directory contains the following binary files:");
    out.println();


    
    
    // Write vocabulary to disk
    {
      String binaryVocabFilename = outputDirName + File.separator + "common.vocab";
      if (logger.isLoggable(Level.INFO)) logger.info("Writing binary common vocabulary to disk at " + binaryVocabFilename);
      
      ObjectOutput vocabOut =
          new BinaryOut(new FileOutputStream(binaryVocabFilename), true);
      symbolTable.setExternalizableEncoding(charset);
        symbolTable.writeExternal(vocabOut);
        vocabOut.flush();
        
      out.println("Common symbol table for source and target language: " + binaryVocabFilename);
    }
    
    
    
    // Construct source language corpus
    if (logger.isLoggable(Level.INFO)) logger.info("Constructing corpus array from file " + sourceCorpusFileName);
    CorpusArray sourceCorpusArray = SuffixArrayFactory.createCorpusArray(sourceCorpusFileName, symbolTable, sourceLengths[0], sourceLengths[1]);
    
    // Write source corpus to disk
    {
      String binarySourceCorpusFilename = outputDirName + File.separator + "source.corpus";
      if (logger.isLoggable(Level.INFO)) logger.info("Writing binary source corpus to disk at " + binarySourceCorpusFilename);
      
        BinaryOut corpusOut = new BinaryOut(new FileOutputStream(binarySourceCorpusFilename), false);
        sourceCorpusArray.writeExternal(corpusOut);  
        corpusOut.flush();
        
      out.println("Source language corpus: " + binarySourceCorpusFilename);
    }
    
    // Construct target language corpus
    if (logger.isLoggable(Level.INFO)) logger.info("Constructing corpus array from file " + targetCorpusFileName);
    CorpusArray targetCorpusArray = SuffixArrayFactory.createCorpusArray(targetCorpusFileName, symbolTable, targetLengths[0], targetLengths[1]);
    
    
    // Write target language corpus to disk
    {
      String binaryTargetCorpusFilename = outputDirName + File.separator + "target.corpus";
      if (logger.isLoggable(Level.INFO)) logger.info("Writing binary target corpus to disk at " + binaryTargetCorpusFilename);
      
        BinaryOut corpusOut = new BinaryOut(new FileOutputStream(binaryTargetCorpusFilename), false);
        targetCorpusArray.writeExternal(corpusOut);  
        corpusOut.flush();
        
        out.println("Target language corpus: " + binaryTargetCorpusFilename);
    }
    
    {
      // Construct alignments data structure
      AlignmentGrids grids = new AlignmentGrids(
          new Scanner(new File(alignmentsFileName)), 
          sourceCorpusArray, 
          targetCorpusArray,
          numberOfSentences);


      // Write alignments to disk
      {
        String binaryAlignmentsFilename = outputDirName + File.separator + "alignment.grids";
        if (logger.isLoggable(Level.INFO)) logger.info("Writing binary alignment grids to disk at " + binaryAlignmentsFilename);


        BinaryOut alignmentsOut = new BinaryOut(binaryAlignmentsFilename);
        grids.writeExternal(alignmentsOut);
        alignmentsOut.flush();
        alignmentsOut.close();


        out.println("Source-target alignment grids: " + binaryAlignmentsFilename);
      }


      // Write lexprobs to disk
      {
        ParallelCorpus parallelCorpus = new AlignedParallelCorpus(sourceCorpusArray, targetCorpusArray, grids);


        if (logger.isLoggable(Level.INFO)) logger.info("Constructing lexprob table");
        LexicalProbabilities lexProbs = 
          new LexProbs(parallelCorpus, Float.MIN_VALUE);


        String lexprobsFilename = outputDirName + File.separator + "lexprobs.txt";
        FileOutputStream stream = new FileOutputStream(lexprobsFilename);
        OutputStreamWriter lexprobsOut = new OutputStreamWriter(stream, charset);


        String binaryLexCountFilename = outputDirName + File.separator + "lexicon.counts";
        if (logger.isLoggable(Level.INFO)) logger.info("Writing binary lexicon counts to disk at " + binaryLexCountFilename);


        //      BinaryOut lexCountOut = new BinaryOut(binaryLexCountFilename);
        ObjectOutput lexCountOut = new ObjectOutputStream(new FileOutputStream(binaryLexCountFilename));
        lexProbs.writeExternal(lexCountOut);
        lexCountOut.close();


        String s = lexProbs.toString();


        if (logger.isLoggable(Level.INFO)) logger.info("Writing lexprobs at " + lexprobsFilename);
        lexprobsOut.write(s);  
        lexprobsOut.flush();
        lexprobsOut.close();
        out.println("Lexprobs at " + lexprobsFilename);


      }
    }
    
    // Write target language suffix array to disk
    {
      // Construct target language suffix array
      if (logger.isLoggable(Level.INFO)) logger.info("Constructing suffix array from file " + targetCorpusFileName);
      SuffixArray targetSuffixArray = SuffixArrayFactory.createSuffixArray(targetCorpusArray, Cache.DEFAULT_CAPACITY);
      
      String binaryTargetSuffixesFilename = outputDirName + File.separator + "target.suffixes";
      if (logger.isLoggable(Level.INFO)) logger.info("Writing binary target corpus to disk at " + binaryTargetSuffixesFilename);
      
      BinaryOut suffixesOut = new BinaryOut(new FileOutputStream(binaryTargetSuffixesFilename), false);
      targetSuffixArray.writeExternal(suffixesOut);  
        suffixesOut.flush();
        
      out.println("Target language suffix array: " + binaryTargetSuffixesFilename);
    }
    
    
    {
      // Construct source language suffix array
      if (logger.isLoggable(Level.INFO)) logger.info("Constructing suffix array from file " + sourceCorpusFileName);
      SuffixArray sourceSuffixArray = SuffixArrayFactory.createSuffixArray(sourceCorpusArray, Cache.DEFAULT_CAPACITY);


      // Write source language suffix array to disk
      {
        String binarySourceSuffixesFilename = outputDirName + File.separator + "source.suffixes";
        if (logger.isLoggable(Level.INFO)) logger.info("Writing binary source corpus to disk at " + binarySourceSuffixesFilename);


        BinaryOut suffixesOut = new BinaryOut(new FileOutputStream(binarySourceSuffixesFilename), false);
        sourceSuffixArray.writeExternal(suffixesOut);  
        suffixesOut.flush();


        out.println("Source language suffix array: " + binarySourceSuffixesFilename);
      }


      // Precompute and write frequent phrase locations to disk
      {
        if (logger.isLoggable(Level.INFO)) logger.info("Precomputing indices for most frequent phrases");
        FrequentPhrases frequentPhrases = 
          new FrequentPhrases(sourceSuffixArray, minFrequency, maxPhrases, maxPhraseLength, maxPhraseLength, maxPhraseSpan, minNonterminalSpan);


        String frequentPhrasesFilename = outputDirName + File.separator + "frequentPhrases";
        if (logger.isLoggable(Level.INFO)) logger.info("Writing precomputing indices for most frequent phrases at " + frequentPhrasesFilename);
        BinaryOut frequentPhrasesOut = new BinaryOut(frequentPhrasesFilename);
        frequentPhrases.writeExternal(frequentPhrasesOut);
        frequentPhrasesOut.close();
      }
    }
    
    out.flush();
    out.close();
    
    if (logger.isLoggable(Level.INFO)) logger.info("Completed writing binary files to disk");
    
  }
  
  public static void main(String[] args) throws IOException {
    
    if (args.length < 4) {
      System.err.println("Usage: java " + Compile.class.getName() + " sourceCorpus targetCorpus alignmentsFile outputDir.josh");
      System.exit(0);
    }
    
    Compile compiler = new Compile();
    compiler.setSourceCorpus(args[0]);
    compiler.setTargetCorpus(args[1]);
    compiler.setAlignments(args[2]);
    compiler.setOutputDir(args[3]);
    if (args.length > 4) compiler.setEncoding(args[4]);
    
    compiler.execute();
  }
  
}
Source Code of joshua.corpus.suffix_array.Compile

Related Classes of joshua.corpus.suffix_array.Compile