Package joshua.corpus.alignment

Source Code of joshua.corpus.alignment.AlignmentGrids

/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.corpus.alignment;

import java.io.File;
import java.io.IOException;
import java.io.ObjectOutput;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import java.util.logging.Level;
import java.util.logging.Logger;

import joshua.corpus.Corpus;
import joshua.util.io.BinaryOut;

/**
* List of alignment grids representing all alignment data for an
* aligned parallel corpus.
* <p>
* Instances of this class are created from human-readable alignment
* text files.
*
* @author Lane Schwartz
*/
public class AlignmentGrids extends AbstractAlignmentGrids {

  /** Logger for this class. */
  private static final Logger logger =
    Logger.getLogger(AlignmentGrids.class.getName());
 
  /** List of individual alignment grids. */
  private final List<AlignmentGrid> alignments;
 
  /**
   * Constructs a list of AlignmentGrid objects.
   * <p>
   * The size parameter is used to allocate the initial
   * capacity of the backing list. If this number is off,
   * things will still work, but memory usage may be less
   * optimal.
   * <p>
   * The object returned by this constructor will required
   * tight spans.
   *
   * @param alignmentScanner
   * @param sourceCorpus
   * @param targetCorpus
   * @param expectedSize Expected number of training sentences.
   *            This parameter merely specifies the initial
   *            capacity of an array list.
   */
  public AlignmentGrids(Scanner alignmentScanner, Corpus sourceCorpus, Corpus targetCorpus, int expectedSize) {
    this(alignmentScanner, sourceCorpus, targetCorpus, expectedSize, true);
  }
 
  /**
   * Constructs a list of AlignmentGrid objects.
   * <p>
   * The size parameter is used to allocate the initial
   * capacity of the backing list. If this number is off,
   * things will still work, but memory usage may be less
   * optimal.
   *
   * @param alignmentScanner
   * @param sourceCorpus
   * @param targetCorpus
   * @param expectedSize Expected number of training sentences.
   *            This parameter merely specifies the initial
   *            capacity of an array list.
   * @param requireTightSpans
   */
  public AlignmentGrids(Scanner alignmentScanner, Corpus sourceCorpus, Corpus targetCorpus, int expectedSize, boolean requireTightSpans) {
    super(sourceCorpus, targetCorpus, requireTightSpans);
   
    this.alignments = new ArrayList<AlignmentGrid>(expectedSize);
   
    boolean finest = logger.isLoggable(Level.FINEST);
    int tenthSize = expectedSize / 10;
   
    int lineNumber = 0;
    while (alignmentScanner.hasNextLine()) {
     
      String line = alignmentScanner.nextLine();
     
      try {
          AlignmentGrid grid = new AlignmentGrid(line);
          alignments.add(grid);
      } catch (Exception e) {
          logger.warning("Sentence pair number " + lineNumber + " was too long, skipping this item");
          alignments.add(null);
      }
     
      lineNumber++;
      if (finest && (lineNumber%tenthSize==0)) {
        logger.finest("AlignmentGrids construction " +
            (lineNumber/tenthSize)+"0% complete");
      }
     
    }
  }
 
  /* See Javadoc for AbstractAlignmentGrids. */
  protected int[] getSourcePoints(int sentenceID, int targetSpanStart, int targetSpanEnd) {
    AlignmentGrid grid = alignments.get(sentenceID);
    if(grid != null) {
        return grid.getSourcePoints(targetSpanStart, targetSpanEnd);
    } else {
        return new int[0];
    }
  }
 
  /* See Javadoc for AbstractAlignmentGrids. */
  protected int[] getTargetPoints(int sentenceID, int sourceSpanStart, int sourceSpanEnd) {
    AlignmentGrid grid = alignments.get(sentenceID);
    if(grid != null) {
        return grid.getTargetPoints(sourceSpanStart, sourceSpanEnd);
    } else {
                    return new int[0];
    }
  }

  /**
   * Serializes this object as binary data.
   *
   * @param out The stream to write this object to.
   * @throws IOException Includes any I/O exceptions that may occur
   * @see java.io.Externalizable#writeExternal
   */
  public void writeExternal(ObjectOutput out) throws IOException {
   
    // Start by writing the number of alignments
    int size = alignments.size();
    logger.fine("Exporting size = " + size + ": 1 integer (4 bytes)");
    out.writeInt(size);
   
    // Write the widths of each grid
    logger.fine("Exporting widths: " + size + " integers (" + size*4 + ") bytes");
    for (AlignmentGrid grid : alignments) {
        if(grid != null) {
      out.writeInt(grid.width);
        } else {
      out.writeInt(0);
        }
    }
   
    // Write the widths of each grid
    logger.fine("Exporting widths: " + size + " integers (" + size*4 + ") bytes");
    for (AlignmentGrid grid : alignments) {
        if(grid != null) {
      out.writeInt(grid.height);
        } else {
      out.writeInt(0);
        }
    }
   
    // Write the number of alignment points in each grid
    logger.fine("Exporting pointCounters: " + (size+1) + " integers (" + (size+1)*4 + ") bytes");
    int pointCounter = 0;
    out.writeInt(pointCounter);
    for (AlignmentGrid grid : alignments) {
        if(grid != null) {
      pointCounter += grid.coordinates.length;
      out.writeInt(pointCounter);
        } else {
      out.writeInt(0);
        }
    }
    logger.finer("\tfinal pointCounter value was: " + pointCounter);

   
    // Write the alignment points
    logger.fine("Exporting grid coordinates: " + pointCounter + " shorts (" + pointCounter*2 + ") bytes");
    for (AlignmentGrid grid : alignments) {
        if(grid != null) {
      for (short point : grid.coordinates) {
        out.writeShort(point);
      }
        }
    }
   
    // Write the reverse alignment points
    logger.fine("Exporting reverse grid coordinates: " + pointCounter + " shorts (" + pointCounter*2 + ") bytes");
    for (AlignmentGrid grid : alignments) {
        if(grid != null) {
      for (short point : grid.transposedCoordinates) {
        out.writeShort(point);
      }
        }
    }
   
  }

  /* See Javadoc for Alignments interface. */
  public int size() {
    return this.alignments.size();
  }
 
  /**
   * Main method used to read a human-readable alignments
   * file and write it to disk as binary data.
   *
   * @param args File names for an existing human-readable
   *             alignments file and for the binary data file
   *             to be written
   * @throws IOException Includes any I/O exceptions that may occur
   */
  public static void main(String[] args) throws IOException {
   
    if (args.length != 2) {
      System.err.println("Usage: java " + AlignmentGrids.class.getName() + " alignments alignments.bin");
      System.exit(0);
    }
   
    String alignmentsFileName = args[0];
    String binaryAlignmentsFileName = args[1];

    File alignmentsFile = new File(alignmentsFileName);
    Scanner scanner = new Scanner(alignmentsFile);
   
    AlignmentGrids grids = new AlignmentGrids(scanner, null, null, 10);
   
    BinaryOut out = new BinaryOut(binaryAlignmentsFileName);
    grids.writeExternal(out);
    out.flush();
    out.close();
  }
 
}
TOP

Related Classes of joshua.corpus.alignment.AlignmentGrids

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.