Package joshua.corpus.lexprob

Source Code of joshua.corpus.lexprob.LexProbs

/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.corpus.lexprob;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectInputStream;
import java.io.ObjectOutput;
import java.util.logging.Level;
import java.util.logging.Logger;

import joshua.corpus.Corpus;
import joshua.corpus.MatchedHierarchicalPhrases;
import joshua.corpus.ParallelCorpus;
import joshua.corpus.alignment.Alignments;
import joshua.corpus.suffix_array.HierarchicalPhrase;
import joshua.corpus.vocab.SymbolTable;
import joshua.util.Counts;
import joshua.util.Pair;

/**
* Represents lexical probability distributions in both directions.
* <p>
* This class calculates the probabilities from sorted word pair
* counts.
*
* @author Lane Schwartz
* @version $LastChangedDate: 2010-02-10 09:59:38 -0600 (Wed, 10 Feb 2010) $
*/
public class LexProbs extends AbstractLexProbs {

  /** Logger for this class. */
  private static final Logger logger =
    Logger.getLogger(LexProbs.class.getName());
 
  /** Source language symbol table. */
  protected final SymbolTable sourceVocab;
 
  /** Target language symbol table. */
  protected final SymbolTable targetVocab;
 
  /** Aligned parallel corpus. */
  protected final ParallelCorpus parallelCorpus;
 
  /**
   * The probability returned when no calculated lexical
   * translation probability is known.
   */
  protected float floorProbability;
 
  /** Word co-occurrence counts from the parallel corpus. */
  protected Counts<Integer,Integer> counts;
 
 
  /**
   * Constructs lexical translation probabilities from a
   * parallel corpus.
   *
   * @param parallelCorpus Aligned parallel corpus
   * @param floorProbability
   */
  public LexProbs(ParallelCorpus parallelCorpus, float floorProbability) {
   
    logger.info("Calculating lexical translation probability table");
    this.counts = initializeCooccurrenceCounts(parallelCorpus, floorProbability);
   
    this.sourceVocab = parallelCorpus.getSourceCorpus().getVocabulary();
    this.targetVocab = parallelCorpus.getTargetCorpus().getVocabulary();
   
    this.parallelCorpus = parallelCorpus;
    this.floorProbability = floorProbability; //Float.MIN_VALUE;
    logger.info("Calculating lexical translation probability table");
   
  }

  /**
   * Constructs lexical translation probabilities from a
   * parallel corpus.
   *
   * @param parallelCorpus Aligned parallel corpus
   * @param ObjectIn
   */
  public LexProbs(ParallelCorpus parallelCorpus, String lexCountsFileName) {
   
//    logger.info("Calculating lexical translation probability table");
//    this.counts = initializeCooccurrenceCounts(parallelCorpus, floorProbability);
   
    this.sourceVocab = parallelCorpus.getSourceCorpus().getVocabulary();
    this.targetVocab = parallelCorpus.getTargetCorpus().getVocabulary();
   
    this.parallelCorpus = parallelCorpus;
   
//    File lexCounts = new File(lexCountsFileName);
    this.counts = new Counts<Integer, Integer>();
    this.floorProbability = Float.MIN_VALUE;
//    File lexCounts = new File(lexCountsFileName);
//    if (!lexCounts.exists()) {
//    } else {
     
    try {
      ObjectInput in = new ObjectInputStream(new FileInputStream(lexCountsFileName));
//      readExternal(in);
      logger.info("Reading lexical translation probability table");
      readExternal(in);
      in.close();
    } catch (Exception e) {
      logger.info("Calculating lexical translation probability table");
      this.counts = initializeCooccurrenceCounts(parallelCorpus, floorProbability);
      // TODO Auto-generated catch block
//      e.printStackTrace();
    }
     
//    }


   
//    logger.info("Calculating lexical translation probability table");
   
  }
  /**
   * Gets co-occurrence counts from a parallel corpus.
   *
   * @param parallelCorpus Aligned parallel corpus
   * @param floorProbability
   * @return Word co-occurrence counts from the parallel
   *         corpus.
   */
  private static Counts<Integer,Integer> initializeCooccurrenceCounts(ParallelCorpus parallelCorpus, float floorProbability) {
   
    if (logger.isLoggable(Level.FINE)) {
      logger.fine("Counting word co-occurrence from parallel corpus. Using floor probability " + floorProbability);
    }
   
    Alignments alignments = parallelCorpus.getAlignments();
    Corpus sourceCorpus = parallelCorpus.getSourceCorpus();
    Corpus targetCorpus = parallelCorpus.getTargetCorpus();
    int numSentences = parallelCorpus.getNumSentences();
   
    Counts<Integer,Integer> counts = new Counts<Integer,Integer>(floorProbability);
   
    // Iterate over each sentence
    for (int sentenceID=0; sentenceID<numSentences; sentenceID++) {

      int sourceStart = sourceCorpus.getSentencePosition(sentenceID);
      int sourceEnd = sourceCorpus.getSentenceEndPosition(sentenceID);

      int targetStart = targetCorpus.getSentencePosition(sentenceID);
      int targetEnd = targetCorpus.getSentenceEndPosition(sentenceID);

      // Iterate over each word in the source sentence
      for (int sourceIndex=sourceStart; sourceIndex<sourceEnd; sourceIndex++) {

        // Get the token for the current source word
        int sourceWord = sourceCorpus.getWordID(sourceIndex);
       
        // Get the target indices aligned to this source word
        int[] targetPoints = alignments.getAlignedTargetIndices(sourceIndex);
       
        // If the source word is unaligned,
        // then we treat it as being aligned to a special NULL token;
        // we use Java's null to represent the NULL token
        if (targetPoints==null) {
         
          counts.incrementCount(sourceWord, null);
         
        } else {
         
          // If the source word is aligned,
          // then we must iterate over each aligned target point
          for (int targetPoint : targetPoints) {

            int targetWord = targetCorpus.getWordID(targetPoint);

            counts.incrementCount(sourceWord, targetWord);
          }
        }
       
      }
     
      // Iterate over each word in the target sentence
      for (int targetIndex=targetStart; targetIndex<targetEnd; targetIndex++) {

        // Get the token for the current source word
        int targetWord = targetCorpus.getWordID(targetIndex);
       
        // Get the source indices aligned to this target word
        int[] sourcePoints = alignments.getAlignedSourceIndices(targetIndex);
       
        // If the source word is unaligned,
        // then we treat it as being aligned to a special NULL token;
        // we use Java's null to represent the NULL token
        if (sourcePoints==null) {
         
          counts.incrementCount(null, targetWord);
         
        }
      }
     
    }
   
    return counts;
  }
 
 
  /* See Javadoc for LexicalProbabilities#sourceGivenTarget(Integer,Integer). */
  public float sourceGivenTarget(Integer sourceWord, Integer targetWord) {
    return counts.getProbability(sourceWord, targetWord);
  }
 
  /* See Javadoc for LexicalProbabilities#targetGivenSource(Integer,Integer). */
  public float targetGivenSource(Integer targetWord, Integer sourceWord) {
    return counts.getReverseProbability(targetWord, sourceWord);
  }
 
  /* See Javadoc for LexicalProbabilities#sourceGivenTarget(String,String). */
  public float sourceGivenTarget(String sourceWord, String targetWord) {
    Integer targetID = (targetWord==null) ? null : targetVocab.getID(targetWord);
    Integer sourceID = (sourceWord==null) ? null : sourceVocab.getID(sourceWord);
   
    return sourceGivenTarget(sourceID, targetID);
  }
 
  /* See Javadoc for LexicalProbabilities#targetGivenSource(String,String). */
  public float targetGivenSource(String targetWord, String sourceWord) {
    int targetID = (targetWord==null) ? null : targetVocab.getID(targetWord);
    int sourceID = (sourceWord==null) ? null : sourceVocab.getID(sourceWord);
   
    return targetGivenSource(targetID, sourceID);
  }
 
  /* See Javadoc for LexicalProbabilities#lexProbSourceGivenTarget(MatchedHierarchicalPhrases,int,HierarchicalPhrase). */
  public float lexProbSourceGivenTarget(MatchedHierarchicalPhrases sourcePhrases, int sourcePhraseIndex, HierarchicalPhrase targetPhrase) {
   
    float sourceGivenTarget = 1.0f;
   
    Corpus sourceCorpus = parallelCorpus.getSourceCorpus();
    Corpus targetCorpus = parallelCorpus.getTargetCorpus();
    Alignments alignments = parallelCorpus.getAlignments();
   
    // Iterate over each terminal sequence in the source phrase
    for (int seq=0; seq<sourcePhrases.getNumberOfTerminalSequences(); seq++) {
     
      // Iterate over each source index in the current terminal sequence
      for (int sourceWordIndex=sourcePhrases.getTerminalSequenceStartIndex(sourcePhraseIndex, seq),
            end=sourcePhrases.getTerminalSequenceEndIndex(sourcePhraseIndex, seq);
          sourceWordIndex<end;
          sourceWordIndex++) {
       
               
        int sourceWord = sourceCorpus.getWordID(sourceWordIndex);
        int[] targetIndices = alignments.getAlignedTargetIndices(sourceWordIndex);
       
        float sum = 0.0f;
        float average;
       
        if (targetIndices==null) {
         
          sum += this.sourceGivenTarget(sourceWord, null);
          average = sum;
         
        } else {
          for (int targetIndex : targetIndices) {

            int targetWord = targetCorpus.getWordID(targetIndex);
            sum += sourceGivenTarget(sourceWord, targetWord);
           
          }
          average = sum / targetIndices.length;
        }
       
        sourceGivenTarget *= average;
      }
     
    }
   
    return sourceGivenTarget;
  }

  /* See Javadoc for LexicalProbabilities#lexProbTargetGivenSource(MatchedHierarchicalPhrases,int,HierarchicalPhrase). */
  public float lexProbTargetGivenSource(MatchedHierarchicalPhrases sourcePhrases, int sourcePhraseIndex, HierarchicalPhrase targetPhrase) {
   
    final boolean LOGGING_FINEST = logger.isLoggable(Level.FINEST);
   
    Corpus sourceCorpus = parallelCorpus.getSourceCorpus();
    Corpus targetCorpus = parallelCorpus.getTargetCorpus();
    Alignments alignments = parallelCorpus.getAlignments();
   
    StringBuilder s;
    if (LOGGING_FINEST) {
      s = new StringBuilder();
      s.append("lexProb( ");
      s.append(sourcePhrases.getPattern().toString());
      s.append(" | ");
      s.append(targetPhrase.toString());
      s.append(" )  =  1.0");
    } else {
      s = null;
    }
   
    float targetGivenSource = 1.0f;

    // Iterate over each terminal sequence in the target phrase
    for (int seq=0; seq<targetPhrase.getNumberOfTerminalSequences(); seq++) {
     
      // Iterate over each source index in the current terminal sequence
      for (int targetWordIndex=targetPhrase.getTerminalSequenceStartIndex(seq),
            end=targetPhrase.getTerminalSequenceEndIndex(seq);
          targetWordIndex<end;
          targetWordIndex++) {
       
        int targetWord = targetCorpus.getWordID(targetWordIndex);
        int[] sourceIndices = alignments.getAlignedSourceIndices(targetWordIndex);
       
        float sum = 0.0f;
        float average;
       
        if (LOGGING_FINEST) s.append(" * (");
               
        if (sourceIndices==null) {

          sum += targetGivenSource(targetWord, null);
          average = sum;
          if (LOGGING_FINEST) s.append(sum);
         
        } else {
         
          for (int sourceIndex : sourceIndices) {

            int sourceWord = sourceCorpus.getWordID(sourceIndex);
            float value = targetGivenSource(targetWord, sourceWord);
            sum += value;
            if (LOGGING_FINEST) {
              s.append('+');
              s.append(value);
            }
          }
          average = sum / sourceIndices.length;
        }

        if (LOGGING_FINEST) s.append(')');
        targetGivenSource *= average;
       
      }
     
    }
   
    if (LOGGING_FINEST) logger.finest(s.toString());
   
    return targetGivenSource;
  }

  /* See Javadoc for LexicalProbabilities#getFloorProbability. */
  public float getFloorProbability() {
    return floorProbability;
  }
 
  /**
   * Gets a string representation of the lexical probabilities.
   * <p>
   * The returned string will have one line per word pair.
   * The pairs are not guaranteed to be returned in any particular order.
   *
   * @return a string representation of the lexical probabilities
   */
  @Override
  public String toString() {
   
    StringBuilder s = new StringBuilder();
   
    for (Pair<Integer,Integer> pair : counts) {

      Integer sourceID = pair.first;
      Integer targetID = pair.second;
     
      if (sourceID==null) {
        s.append("NULL");
      } else {
        s.append(sourceVocab.getWord(sourceID));
      }
     
      s.append(' ');
     
      if (targetID==null) {
        s.append("NULL");
      } else {
        s.append(targetVocab.getWord(targetID));
      }
     
      s.append(' ');
      s.append(targetGivenSource(targetID,sourceID));
     
      s.append(' ');
      s.append(sourceGivenTarget(sourceID,targetID));
     
      s.append('\n');
    }
   
    return s.toString();
  }
 
  public void writeExternal(ObjectOutput out) throws IOException {
   
    counts.writeExternal(out);
   
  }
 
  public void readExternal(ObjectInput in) throws IOException,
  ClassNotFoundException {
    /*
    Map<Integer, Map<Integer, Integer>> ctMap =
      (HashMap<Integer,Map<Integer,Integer>>) in.readObject();
    counts.setCounts(ctMap);
   
    // Read bTotals

    Map<Integer, Integer> btMap =
      (HashMap<Integer,Integer>) in.readObject();
   
    counts.setBTotals(btMap);
   
    // Read probabilities
    Map<Integer, Map<Integer, Float>> pbMap =
      (HashMap<Integer,Map<Integer,Float>>) in.readObject();
    counts.setProbabilities(pbMap);
   
    // Read reverse probabilities
    Map<Integer, Map<Integer, Float>> rpMap =
      (HashMap<Integer,Map<Integer,Float>>) in.readObject();
    counts.setProbabilities(rpMap);
    */
   
    this.counts.readExternal(in);
    floorProbability = counts.getFloorProbability();
   

  }
 
  public SymbolTable getSourceVocab() {
    return sourceVocab;
  }
 
  public SymbolTable getTargetVocab() {
    return targetVocab;
  }
 
  /**
   * Gets the word co-occurrence counts for this object.
   *
   * @return the word co-occurrence counts for this object.
   */
  protected Counts<Integer,Integer> getCounts() {
    return this.counts;
  }
}
TOP

Related Classes of joshua.corpus.lexprob.LexProbs

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.