Package joshua.corpus.suffix_array

Source Code of joshua.corpus.suffix_array.AbstractHierarchicalPhrases

/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.corpus.suffix_array;

import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import joshua.corpus.MatchedHierarchicalPhrases;
import joshua.corpus.Span;
import joshua.corpus.vocab.SymbolTable;

/**
* Implements common algorithms used with hierarchical phrases.
*
* @author Lane Schwartz
* @version $LastChangedDate: 2010-02-11 15:53:30 -0600 (Thu, 11 Feb 2010) $
*/
public abstract class AbstractHierarchicalPhrases implements
    MatchedHierarchicalPhrases {

  /** Logger for this class. */
  private static final Logger logger =
    Logger.getLogger(AbstractHierarchicalPhrases.class.getName());
 
  /**
   * Represents a sequence of terminals and nonterminals as
   * integer IDs. The pattern is <em>not</em> rooted to a
   * location in a corpus.
   */
  protected final Pattern pattern;
 
  /**
   * Represents the length of each contiguous sequence of
   * terminals in the pattern. The array is obtained from
   * the pattern when this object is constructed.
   * <p>
   * To save memory, this information is stored as bytes
   * instead of integers.
   *
   * This means that the maximum value that can be stored
   * here is 127. This should not be a problem unless a very
   * large value is used for maximum phrase length.
   */
  protected final byte[] terminalSequenceLengths;
 
 
  /**
   * Number of hierarchical phrases represented by this object.
   */
  protected final int size;
 
  /**
   * Running count of how many times this class's constructor
   * has been invoked. The field is public, mutable and updated
   * without any synchronization.
   */
  public static int counter = 0;
 
  /**
   * Constructs an abstract object representing
   * locations in a corpus that match the hierarchical phrase
   * represented by the specified pattern.
   * <p>
   * As a side effect, each invocation increments the static
   * {@link #counter}.
   *
   * @param pattern Pattern representing a hierarchical phrase;
   *                its terminal sequence lengths are read here
   * @param numPhrases Number of hierarchical phrases represented
   *                by this object; returned later by {@link #size()}
   */
  protected AbstractHierarchicalPhrases(Pattern pattern, int numPhrases) {
    this.pattern = pattern;
    this.terminalSequenceLengths = pattern.getTerminalSequenceLengths();
    this.size = numPhrases;
    counter++;
  }
   
  /**
   * Implements the dotted operators (<̈, =̈, >̈)
   * from Lopez (2008), p78-79.
   * <p>
   * This method behaves as follows when provided prefix
   * phrase m_a_alpha and suffix phrase m_alpha_b:
   * <ul>
   * <li>Returns 0 if m_a_alpha and m_alpha_b can be paired.</li>
   * <li>Returns -1 if m_a_alpha and m_alpha_b cannot be
   *     paired, and m_a_alpha precedes m_alpha_b in the
   *     corpus.</li>
   * <li>Returns  1 if m_a_alpha and m_alpha_b cannot be
   *     paired, and m_a_alpha follows m_alpha_b in the
   *     corpus.</li>
   * </ul>
   *
     * @param m_a_alpha List of prefix hierarchical phrases
   * @param i Index into m_a_alpha
   * @param m_alpha_b List of suffix hierarchical phrases
   * @param j Index into m_alpha_b
   * @param minNonterminalSpan Minimum allowed nonterminal span
   * @param maxPhraseSpan Maximum allowed phrase span
   * @return
   * <ul>
   * <li>0 if m_a_alpha and m_alpha_b can be paired (=̈).</li>
   * <li>-1 if m_a_alpha and m_alpha_b cannot be paired, and
   *     m_a_alpha precedes m_alpha_b in the corpus (<̈).</li>
   * <li> 1 if m_a_alpha and m_alpha_b cannot be paired, and
   *     m_a_alpha follows m_alpha_b in the corpus. (>̈)</li>
   * </ul>
   */ 
  protected static int compare(
      MatchedHierarchicalPhrases m_a_alpha, final int i,
      MatchedHierarchicalPhrases m_alpha_b, final int j,
      int minNonterminalSpan, int maxPhraseSpan) {
 
    // Try the cheapest check first: Are they in the same sentence?
    {
      int m_a_alpha_i_sentenceNumber = m_a_alpha.getSentenceNumber(i);
      int m_alpha_b_j_sentenceNumber = m_alpha_b.getSentenceNumber(j);

      if (m_a_alpha_i_sentenceNumber < m_alpha_b_j_sentenceNumber) {
        return -1;
      } else if (m_a_alpha_i_sentenceNumber > m_alpha_b_j_sentenceNumber) {
        return 1;
      }
    }

   
    int prefixStartPosition = m_a_alpha.getStartPosition(i, 0);
    int suffixStartPosition = m_alpha_b.getStartPosition(j, 0);
   
    if (prefixStartPosition > suffixStartPosition) {
      return 1;
    } else if (prefixStartPosition <= suffixStartPosition-maxPhraseSpan) {
      return -1;
    } else {

      // If we get to this point, we know:
      //
      // * prefix and suffix are in the same sentence
      // * prefix occurs before suffix in the sentence
      // * prefix and suffix are within maxPhraseSpan of each other
     
      boolean m_a_alpha_endsWithNonterminal = m_a_alpha.endsWithNonterminal();
      boolean m_alpha_b_startsWithNonterminal = m_alpha_b.startsWithNonterminal();
     
      // Does the prefix (m_a_alpha) overlap with
      //      the suffix (m_alpha_b) on any words?
      if (m_a_alpha_endsWithNonterminal &&
          m_alpha_b_startsWithNonterminal &&
          m_a_alpha.arity()==1 &&
          m_alpha_b.arity()==1 &&
          m_a_alpha.getTerminalSequenceLength(0)==1 &&
          m_alpha_b.getTerminalSequenceLength(0)==1) {
       
        return 0;
     
      } else {
       
        int m_a_alphaTerminalSequenceLengths = m_a_alpha.getNumberOfTerminalSequences();//.terminalSequenceLengths.length;
        int m_alpha_bTerminalSequenceLengths = m_alpha_b.getNumberOfTerminalSequences();//.terminalSequenceLengths.length;
       
        int m_alpha_b_prefix_start = j*m_alpha_bTerminalSequenceLengths;
        int m_alpha_b_prefix_end;

        boolean m_a_alpha_startsWithNonterminal = m_a_alpha.startsWithNonterminal();
        boolean m_alpha_b_endsWithNonterminal = m_alpha_b.endsWithNonterminal();   
       
        // If the m_alpha_b pattern ends with a nonterminal
        if (m_alpha_b_endsWithNonterminal ||
            // ...or if the m_alpha_b pattern ends with two terminals
            m_alpha_b.endsWithTwoTerminals()) {

          m_alpha_b_prefix_end = m_alpha_b_prefix_start + m_alpha_bTerminalSequenceLengths;

        } else { // Then the m_alpha_b pattern ends with a nonterminal followed by a terminal

          m_alpha_b_prefix_end = m_alpha_b_prefix_start + m_alpha_bTerminalSequenceLengths - 1;

        }

        int m_a_alpha_suffix_start;
        int m_a_alpha_suffix_end;
        boolean increment_m_a_alpha_suffix_start;

        int m_a_alphaExtra;

        // If the m_a_alpha pattern starts with a nonterminal
        if (m_a_alpha_startsWithNonterminal) {
          m_a_alphaExtra = 0;
          m_a_alpha_suffix_start = i*m_a_alphaTerminalSequenceLengths;
          m_a_alpha_suffix_end = m_a_alpha_suffix_start + m_a_alphaTerminalSequenceLengths;
          increment_m_a_alpha_suffix_start = false;
        } else if (m_a_alpha.secondTokenIsTerminal()) {
          // Then the m_a_alpha pattern starts with two terminals
          m_a_alphaExtra = 0;
          m_a_alpha_suffix_start = i*m_a_alphaTerminalSequenceLengths;
          m_a_alpha_suffix_end = m_a_alpha_suffix_start + m_a_alphaTerminalSequenceLengths;

          increment_m_a_alpha_suffix_start = true;
        } else {
          // Then the m_a_alpha pattern starts with a terminal followed by a nonterminal
          m_a_alphaExtra = 1;
          m_a_alpha_suffix_start = i*m_a_alphaTerminalSequenceLengths + 1;
          m_a_alpha_suffix_end = i*m_a_alphaTerminalSequenceLengths + m_a_alphaTerminalSequenceLengths;

          increment_m_a_alpha_suffix_start = false;
        }

        int m_a_alpha_suffix_length = m_a_alpha_suffix_end - m_a_alpha_suffix_start;
        int m_alpha_b_prefix_length = m_alpha_b_prefix_end - m_alpha_b_prefix_start;

        if (m_alpha_b_prefix_length != m_a_alpha_suffix_length) {
          throw new MismatchedHierarchicalPhrasesException();
        } else {

          int result = 0;

          for (int index=0; index<m_a_alpha_suffix_length; index++) {

            int a = m_a_alpha.getStartPosition(i, index+m_a_alphaExtra);
            if (increment_m_a_alpha_suffix_start && index==0) {
              a++;
            }
            int b = m_alpha_b.getStartPosition(j, index);

            if (a > b) {
              result = 1;
              break;
            } else if (a < b) {
              result = -1;
              break;
            }
          }

          if (result==0) {
            int positionNumber = m_alpha_bTerminalSequenceLengths-1;
            int length = m_alpha_b.getStartPosition(j, positionNumber) + m_alpha_b.getTerminalSequenceLength(positionNumber) - prefixStartPosition;

            if (m_alpha_b_endsWithNonterminal)
              length += minNonterminalSpan;
            if (m_a_alpha_startsWithNonterminal)
              length += minNonterminalSpan;

            if (length > maxPhraseSpan) {
              result = -1;
            }
          }

          return result;
        }

      }
    }
  }
 
  /**
   * Constructs the data to represent the hierarchical phrase,
   * formed by intersecting the <code>i<code>th phrase of
   * <code>M_a_alpha</code> with the <code>j<code>th phrase
   * of <code>M_alpha_b</code> and appends this new data to
   * the <code>data</code> list.
   *
   * @param M_a_alpha List of prefix hierarchical phrases
   * @param i Index into M_a_alpha
   * @param M_alpha_b List of suffix hierarchical phrases
   * @param j Index into M_alpha_b
   * @param list List where new data will be added
   */
  protected static void partiallyConstruct(
      MatchedHierarchicalPhrases M_a_alpha, int i,
      MatchedHierarchicalPhrases M_alpha_b, int j,
      List<Integer> list) {
   
   
    boolean prefixEndsWithNonterminal = M_a_alpha.endsWithNonterminal();
   
    // Get all start positions for the prefix phrase, and append them to the running list
    {
      int numTerminalSequences = M_a_alpha.getNumberOfTerminalSequences();
     
      for (int index=0; index<numTerminalSequences; index++) {
        list.add(M_a_alpha.getStartPosition(i, index));
      }
     
    }
   
   
    if (prefixEndsWithNonterminal) {
      // Get the final start positions for the suffix phrase, and append it to the running list
      int index = M_alpha_b.getNumberOfTerminalSequences() - 1;
      list.add(M_alpha_b.getStartPosition(j, index));
    }
   
  }

  /**
   * Implements the <tt>QUERY_INTERSECT</tt> algorithm from
   * Adam Lopez's thesis (Lopez 2008). This implementation
   * follows a corrected algorithm (Lopez, personal communication).
   *
   * @param pattern Pattern which will be associated with the new list
   *                of matched hierarchical phrases
   * @param M_a_alpha Prefix list of matched hierarchical phrases
   * @param M_alpha_b Suffix list of matched hierarchical phrases
   * @param minNonterminalSpan Minimum allowed span for a nonterminal
   * @param maxPhraseSpan Maximum allowed phrase span
   * @return The list of matched hierarchical phrases resulting from
   *         the intersection of the two provided lists
   *         of matched hierarchical phrases
   */
  public static MatchedHierarchicalPhrases queryIntersect(Pattern pattern,
      MatchedHierarchicalPhrases M_a_alpha,
      MatchedHierarchicalPhrases M_alpha_b,
      int minNonterminalSpan, int maxPhraseSpan, Suffixes sourceSuffixArray) {

    if (logger.isLoggable(Level.FINER)) {
      logger.finer("queryIntersect("+pattern+" M_a_alpha.size=="+M_a_alpha.size() + ", M_alpha_b.size=="+M_alpha_b.size());     
    }
   
    if (sourceSuffixArray!=null && sourceSuffixArray.getCachedHierarchicalPhrases().containsKey(pattern)) {
      return sourceSuffixArray.getCachedHierarchicalPhrases().get(pattern);
    } else {

      // results is M_{a_alpha_b} in the paper
      ArrayList<Integer> data = new ArrayList<Integer>();
      ArrayList<Integer> sentenceNumbers = new ArrayList<Integer>();

      int I = M_a_alpha.size();
      int J = M_alpha_b.size();

      int i = 0;
      int j = 0;

      while (i<I && j<J) {

        while (j<J && compare(M_a_alpha, i, M_alpha_b, j, minNonterminalSpan, maxPhraseSpan) > 0) {
          j++; // advance j past no longer needed item in M_alpha_b
        }

        int k = i;         

        // Process all matchings in M_a_alpha with same first element
        int kth_startPosition = M_a_alpha.getStartPosition(k, 0);
        while (i<I && M_a_alpha.getStartPosition(i, 0) == kth_startPosition) {

          int l = j;

          // While not M_a_alpha[i] <̈ M_alpha_b[l]
          if (l < J) {
            int comparison = compare(M_a_alpha, i, M_alpha_b, l,  minNonterminalSpan, maxPhraseSpan);
            while (l < J && !(comparison < 0)) {

              // If M_a_alpha[i] =̈ M_alpha_b[l]
              if (comparison == 0) {

                // Append M_a_alpha[i] |><| M_alpha_b[l] to M_a_alpha_b
                partiallyConstruct(M_a_alpha, i, M_alpha_b, l, data);
                sentenceNumbers.add(M_a_alpha.getSentenceNumber(i));

              } // end if

              // We can visit m_alpha_b[l] again, but only next time through outermost loop
              l = l + 1;
              if (l < J) {
                comparison = compare(M_a_alpha, i, M_alpha_b, l,  minNonterminalSpan, maxPhraseSpan);
              }

            } // end while
          } // end if

          // advance i past no longer needed item in M_a_alpha
          i = i + 1;

        } // end while

      } // end while

      //    if (sourceSuffixArray==null) {
      return new HierarchicalPhrases(pattern, data, sentenceNumbers);
      //    } else {
      //      int[] startPositions = new int[data.size()];
      //      for (int index=0, n=data.size(); index<n; index++) {
      //        startPositions[index] = data.get(index);
      //      }
      //     
      //      return sourceSuffixArray.createHierarchicalPhrases(startPositions, pattern, sourceSuffixArray.getVocabulary());     
      //    }

    }   
  }

  /**
   * Gets the stored length of the <code>i</code>th contiguous
   * terminal sequence of this object's pattern. The value is
   * kept as a byte and widened to an int on return.
   * See Javadoc for MatchedHierarchicalPhrases interface.
   *
   * @param i Index of the terminal sequence within the pattern
   * @return Length recorded for that terminal sequence
   */
  public int getTerminalSequenceLength(int i) {
    return terminalSequenceLengths[i];
  }
 
  /**
   * Gets the number of contiguous terminal sequences in this
   * object's pattern, i.e. the number of entries in the
   * terminal sequence length array.
   * See Javadoc for MatchedHierarchicalPhrases interface.
   *
   * @return Number of terminal sequences
   */
  public int getNumberOfTerminalSequences() {
    return terminalSequenceLengths.length;
  }
 
  /**
   * Reports whether this object's pattern ends with a
   * nonterminal, by delegating to the pattern.
   * See Javadoc for PatternFormat interface.
   *
   * @return the pattern's answer to <code>endsWithNonterminal()</code>
   */
  public boolean endsWithNonterminal() {
    return pattern.endsWithNonterminal();
  }
 
  /**
   * Reports whether this object's pattern starts with a
   * nonterminal, by delegating to the pattern.
   * See Javadoc for PatternFormat interface.
   *
   * @return the pattern's answer to <code>startsWithNonterminal()</code>
   */
  public boolean startsWithNonterminal() {
    return pattern.startsWithNonterminal();
  }
 
  /**
   * Reports whether this object's pattern ends with two
   * terminals, by delegating to the pattern.
   * See Javadoc for PatternFormat interface.
   *
   * @return the pattern's answer to <code>endsWithTwoTerminals()</code>
   */
  public boolean endsWithTwoTerminals() {
    return pattern.endsWithTwoTerminals();
  }
 
  /**
   * Reports whether the second token of this object's pattern
   * is a terminal, by delegating to the pattern.
   * See Javadoc for PatternFormat interface.
   *
   * @return the pattern's answer to <code>secondTokenIsTerminal()</code>
   */
  public boolean secondTokenIsTerminal() {
    return pattern.secondTokenIsTerminal();
  }
 
  /* See Javadoc for MatchedHierarchicalPhrases interface. */
  public int getEndPosition(int phraseIndex, int positionNumber) {
   
    return getStartPosition(phraseIndex, positionNumber) + getTerminalSequenceLength(positionNumber);
       
  }
 
  /* See Javadoc for MatchedHierarchicalPhrases interface. */
  public int getTerminalSequenceStartIndex(int phraseIndex, int sequenceIndex) {
//    int n = terminalSequenceLengths.length;
//    int nthPhraseIndex = phraseIndex*n;
   
    int start = this.getStartPosition(phraseIndex, sequenceIndex);//this.terminalSequenceStartIndices[nthPhraseIndex+sequenceIndex];
    return start;
  }
 
  /* See Javadoc for MatchedHierarchicalPhrases interface. */
  public int getTerminalSequenceEndIndex(int phraseIndex, int sequenceIndex) {
//    int n = terminalSequenceLengths.length;
//    int nthPhraseIndex = phraseIndex*n;
   
    int start = this.getStartPosition(phraseIndex, sequenceIndex);//this.terminalSequenceStartIndices[nthPhraseIndex+sequenceIndex];
    int end = start + this.terminalSequenceLengths[sequenceIndex];
   
    return end;
  }
 

  /* See Javadoc for MatchedHierarchicalPhrases interface. */
  public int getFirstTerminalIndex(int phraseIndex) {
//    int n = terminalSequenceLengths.length;
//    int nthPhraseIndex = phraseIndex*n;
    int index = 0;
   
    int start = this.getStartPosition(phraseIndex, index);//this.terminalSequenceStartIndices[nthPhraseIndex+index];
    return start;
  }
 
 
 
  /* See Javadoc for MatchedHierarchicalPhrases interface. */
  public boolean containsTerminalAt(int phraseIndex,
      int alignedPointIndex) {
   
    int n = terminalSequenceLengths.length;
//    int nthPhraseIndex = phraseIndex*n;
   
    for (int index=0; index<n; index++) {
      int start = this.getStartPosition(phraseIndex, index);//this.terminalSequenceStartIndices[nthPhraseIndex+index];
      if (alignedPointIndex >= start &&
          alignedPointIndex < start + this.terminalSequenceLengths[index]) {
        return true;
      }
    }   
   
    return false;

  }
 
  /* See Javadoc for MatchedHierarchicalPhrases interface. */
  public int getLastTerminalIndex(int phraseIndex) {
    int n = terminalSequenceLengths.length;
    int index = n-1;
   
    int start = getStartPosition(phraseIndex, index);
    int end = start + this.terminalSequenceLengths[n-1];
   
    return end;
   
  }
 
 
  /* See Javadoc for MatchedHierarchicalPhrases interface. */
  public Span getSpan(int phraseIndex) {
   
    int n = terminalSequenceLengths.length;
//    int nthPhraseIndex = phraseIndex*n;
   
    int lastIndex = n-1;
   
    int start = this.getStartPosition(phraseIndex, 0);//this.terminalSequenceStartIndices[nthPhraseIndex+0];
    int lastStart = this.getStartPosition(phraseIndex, lastIndex);//this.terminalSequenceStartIndices[nthPhraseIndex+lastIndex];
    int lastLength = this.terminalSequenceLengths[lastIndex];
    int end = lastStart + lastLength;   
   
    return new Span(start, end);
  }
 
  /**
   * Gets the number of nonterminals in this object's pattern,
   * as recorded in the pattern's <code>arity</code> field.
   *
   * @return the number of nonterminals
   */
  public int arity() {
    return pattern.arity;
  }
 
  /**
   * Gets the pattern supplied when this object was constructed.
   * See Javadoc for MatchedHierarchicalPhrases interface.
   *
   * @return the pattern associated with these matched phrases
   */
  public Pattern getPattern() {
    return this.pattern;
  }

  /**
   * Gets the number of hierarchical phrases represented by
   * this object, as supplied at construction time.
   * See Javadoc for MatchedHierarchicalPhrases interface.
   *
   * @return the number of hierarchical phrases
   */
  public int size() {
    return size;
  }
 
  public boolean equals(Object o) {
    if (o instanceof AbstractHierarchicalPhrases) {
      AbstractHierarchicalPhrases other = (AbstractHierarchicalPhrases) o;
     
      if (this.getPattern().equals(other.getPattern())
          && this.size()==other.size()
          && this.arity()==other.arity()
          && this.getNumberOfTerminalSequences() == other.getNumberOfTerminalSequences()
          && this.endsWithNonterminal()==other.endsWithNonterminal()
          && this.startsWithNonterminal()==other.startsWithNonterminal()
          && this.endsWithTwoTerminals()==other.endsWithTwoTerminals()
          && this.secondTokenIsTerminal()==other.secondTokenIsTerminal()) {
     
        int n = getNumberOfTerminalSequences();
        for (int i=0, size=this.size(); i<size; i++) {
          for (int seq=0; seq<n; seq++) {
            if (this.getStartPosition(i, seq) != other.getStartPosition(i, seq) ||
                this.getEndPosition(i, seq) != other.getEndPosition(i, seq)) {
              return false;
            }
          }
        }
       
        return true;
      } else {
        return false;
      }
     
    } else {
      return false;
    }
  }
 
 
  protected static Pattern getPatternWithInitialX(Pattern pattern) {
    int[] xwords = new int[pattern.words.length+1];
    xwords[0] = SymbolTable.X;
    for (int i=0; i<pattern.words.length; i++) {
      xwords[i+1] = pattern.words[i];
    }
    return new Pattern(pattern.vocab, xwords);
  }
 
  /**
   * Builds a new pattern consisting of the nonterminal
   * <code>SymbolTable.X</code> followed by the words of this
   * object's pattern, by delegating to the static overload.
   *
   * @return Newly constructed pattern that starts with X
   */
  protected Pattern getPatternWithInitialX() {
    return getPatternWithInitialX(pattern);
  }
 
  /**
   * Builds a new pattern from this object's pattern's vocabulary
   * and words, passing <code>SymbolTable.X</code> as the extra
   * trailing argument to the <code>Pattern</code> constructor.
   * NOTE(review): presumably this appends X after the existing
   * words; confirm against the Pattern constructor.
   *
   * @return Newly constructed pattern
   */
  protected Pattern getPatternWithFinalX() {
    return new Pattern(pattern.vocab, pattern.words, SymbolTable.X);
  }
}
TOP

Related Classes of joshua.corpus.suffix_array.AbstractHierarchicalPhrases

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.