// Source code of joshua.corpus.lexprob.ExtractWordPairs

/* This file is part of the Joshua Machine Translation System.
 * 
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */
package joshua.corpus.lexprob;


import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.util.Arrays;
import java.util.HashSet;
import java.util.NoSuchElementException;
import java.util.Scanner;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;


import joshua.util.CommandLineParser;
import joshua.util.CommandLineParser.Option;


/**
 * Utility to extract aligned word pairs from an aligned corpus.
 * <p>
 * The files used must use Unix-style newlines. 
 * 
 * @author Lane Schwartz
 * @version $LastChangedDate: 2009-05-22 23:31:12 -0500 (Fri, 22 May 2009) $
 * @see "Section 4.4 of 'Statistical Phrase-Based Translation' by
 *      Philipp Koehn, Franz Josef Och, & Daniel Marcu (HLT-NAACL, 2003)"
 */
public class ExtractWordPairs {


  private static final Logger logger = Logger.getLogger(ExtractWordPairs.class.getName());
  
  /** Special marker to use with unaligned words */
  public static final String UNALIGNED_MARKER = "NULL";
  
  /**
   * Extract aligned word pairs from an aligned corpus.
   * <p>
   * This method does not convert from upper case to lower
   * case. All input needs to already be in the proper case.
   * <p>
   * NOTE: The scanners provided for source text, target text,
   * and alignments must all be backed by data that uses
   * Unix-style newlines.
   * 
   * @param number_of_lines The number of lines to process
   *                    from the aligned corpus.
   * @param source_text Scanner backed by the source language text
   * @param target_text Scanner backed by the target language text
   * @param alignments  Scanner backed by the sentence alignment data
   * @param outputFile  Writer to use when producing output results
   * @throws IOException Thrown if an I/O error occurs when writing results
   */
  public static void extract(int number_of_lines, Scanner source_text, Scanner target_text, Scanner alignments, Writer outputFile) throws IOException {
    
    if (logger.isLoggable(Level.INFO)) {
      logger.info("Extracting aligned word pairs from aligned sentences...");
    }
    
    // Iterate over all lines of input
    for (int line_number=1; line_number<=number_of_lines; line_number++) {


      // Read in the next line from the files
      // BUG: use joshua.util.Regex.spaces.split(...)
      String[] source_words = source_text.nextLine().split("\\s+");
      String[] target_words = target_text.nextLine().split("\\s+");
      String[] raw_alignment_points = alignments.nextLine().split("\\s+");


      try {
        // We have a new sentence pair.
        //    Initially assume that all words are unaligned.
        //    As each alignment point is processed, aligned words will be removed from the appropriate set
        Set<Integer> unaligned_source_words = new HashSet<Integer>(source_words.length);
        Set<Integer> unaligned_target_words = new HashSet<Integer>(target_words.length);


        for (int i=0; i<source_words.length; i++) { unaligned_source_words.add(i); }
        for (int i=0; i<target_words.length; i++) { unaligned_target_words.add(i); }


        // Iterate over each alignment point in the aligned sentence pair
        for (String raw_alignment_point : raw_alignment_points) {


          // Alignment points must be of the format #-#, where # is a number
          int split_point = raw_alignment_point.indexOf('-');


          int x = Integer.valueOf(raw_alignment_point.substring(0,split_point));
          int y = Integer.valueOf(raw_alignment_point.substring(split_point+1));


          // Remove this source word from the set of unaligned source words
          unaligned_source_words.remove(x);


          // Remove this target word from the set of unaligned target words
          unaligned_target_words.remove(y);




          // Lowercase the words,
          //    then print the word pair to the output file
          outputFile.write(source_words[x].toLowerCase() + " " + target_words[y].toLowerCase() + "\n");


        }    


        // For each unaligned source word,
        //    lowercase the word,
        //    then print the word, aligned with the special token NULL
        for (int source_word_index : unaligned_source_words) {
          outputFile.write(source_words[source_word_index].toLowerCase() + " " + UNALIGNED_MARKER + "\n");          
        }


        // For each unaligned target word,
        //    lowercase the word,
        //    then print the word, aligned with the special token NULL
        for (int target_word_index : unaligned_target_words) {
          outputFile.write(UNALIGNED_MARKER + " " + target_words[target_word_index].toLowerCase() + "\n");          
        }
      } catch (ArrayIndexOutOfBoundsException e) {
        if (logger.isLoggable(Level.SEVERE)) {
          logger.severe("ArrayIndexOutOfBoundsException at sentence pair:\n" + Arrays.toString(source_words) + "\n"+Arrays.toString(target_words) +"\n"+Arrays.toString(raw_alignment_points) + "\n");
        }
        throw e;
      }
        


    }


    // Tidy up
    outputFile.flush();
    outputFile.close();


    if (logger.isLoggable(Level.INFO)) {
      logger.info("...done.");
    }
  }
  
  /**
   * Utility to extract aligned word pairs from an aligned
   * corpus.
   * 
   * @param args Command line arguments
   */
  public static void main(String[] args) {


    CommandLineParser commandLine = new CommandLineParser();
    
    Option<String> source_file = commandLine.addStringOption('s',"source-text","SOURCE_FILENAME","name of file containing source language corpus");
    //Option<String> source_file_encoding = commandLine.addStringOption("source-encoding","SOURCE_ENCODING","ISO-8859-1","source language file encoding");
    Option<String> source_file_encoding = commandLine.addStringOption("source-encoding","SOURCE_ENCODING","UTF-8","source language file encoding");
    Option<Boolean> source_file_gz = commandLine.addBooleanOption("source-text-gzipped",false,"is the source text gzipped");
    
    Option<String> target_file = commandLine.addStringOption('t',"target-text","TARGET_FILENAME","name of file containing target language corpus");
    //Option<String> target_file_encoding = commandLine.addStringOption("target-encoding","TARGET_ENCODING","ISO-8859-1","target language file encoding");
    Option<String> target_file_encoding = commandLine.addStringOption("target-encoding","TARGET_ENCODING","UTF-8","target language file encoding");
    Option<Boolean> target_file_gz = commandLine.addBooleanOption("target-text-gzipped",false,"is the target text gzipped");
    
    Option<String> alignment_file = commandLine.addStringOption('a',"alignment","ALIGNMENT_FILENAME","name of file containing word alignments for the sentences in the corpus");
    Option<Boolean> alignment_file_gz = commandLine.addBooleanOption("alignment-file-gzipped",false,"is the alignment file gzipped");


    Option<Integer> num_lines = commandLine.addIntegerOption('l',"lines","LINE_COUNT","number of aligned sentences in the corpus");
    
    Option<String> output_file = commandLine.addStringOption('o',"output","OUTPUT_FILENAME","file where aligned word pairs will be written");
    Option<String> output_file_encoding = commandLine.addStringOption("output-encoding","OUTPUT_ENCODING","UTF-8","output file encoding");
    Option<Boolean> output_file_gz = commandLine.addBooleanOption("output-text-gzipped",false,"should the output file be gzipped");
    
    commandLine.parse(args);
    
    
    try {
      
      // Set System.out and System.err to use the provided character encoding
      try {
        System.setOut(new PrintStream(System.out, true, commandLine.getValue(source_file_encoding)));
        System.setErr(new PrintStream(System.err, true, commandLine.getValue(source_file_encoding)));
      } catch (UnsupportedEncodingException e1) {
        System.err.println(commandLine.getValue(source_file_encoding) + " is not a valid encoding; using system default encoding for System.out and System.err.");
      } catch (SecurityException e2) {
        System.err.println("Security manager is configured to disallow changes to System.out or System.err; using system default encoding.");
      }
      
      // The number of lines to read
      int number_of_lines = commandLine.getValue(num_lines);


      // Set up the source text for reading
      Scanner source_text;
      if (commandLine.getValue(source_file).endsWith(".gz") || commandLine.getValue(source_file_gz)) {
        source_text = new Scanner(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(commandLine.getValue(source_file))),commandLine.getValue(source_file_encoding))));
      } else {
        source_text = new Scanner( new File(commandLine.getValue(source_file)), commandLine.getValue(source_file_encoding));
      }
      
      // Set up the target text for reading
      Scanner target_text;
      if (commandLine.getValue(target_file).endsWith(".gz") || commandLine.getValue(target_file_gz)) {
        target_text = new Scanner(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(commandLine.getValue(target_file))),commandLine.getValue(target_file_encoding))));
      } else {
        target_text = new Scanner( new File(commandLine.getValue(target_file)), commandLine.getValue(target_file_encoding));
      }
      
      // Set up the alignment file for reading
      Scanner alignments;
      if (commandLine.getValue(alignment_file).endsWith(".gz") || commandLine.getValue(alignment_file_gz)) {
        alignments = new Scanner(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(commandLine.getValue(alignment_file))))));
      } else {
        alignments = new Scanner( new File(commandLine.getValue(alignment_file)));
      }
      
      
      // Set up the output file for writing
      Writer outputFile;
      if (commandLine.getValue(output_file).endsWith(".gz") || commandLine.getValue(output_file_gz)) {
        outputFile = new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(commandLine.getValue(output_file))),commandLine.getValue(output_file_encoding));
      } else {
        outputFile = new OutputStreamWriter(new FileOutputStream(commandLine.getValue(output_file)),commandLine.getValue(output_file_encoding));
      }
      
      try {
        extract(number_of_lines, source_text, target_text, alignments, outputFile);
      } catch (NoSuchElementException e) {
        System.err.println("There are more than " + number_of_lines + " lines of input. Please determine the actual number of lines of input, and re-run with the appropriate command line flag.");
        commandLine.printUsage();
        System.exit(-1);
      }


    } catch (FileNotFoundException e) {
      e.printStackTrace();
    } catch (UnsupportedEncodingException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }


}
// Source code of joshua.corpus.lexprob.ExtractWordPairs
//
// Related classes of joshua.corpus.lexprob.ExtractWordPairs