/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.corpus.suffix_array;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import joshua.corpus.MatchedHierarchicalPhrases;
import joshua.corpus.Span;
import joshua.corpus.vocab.SymbolTable;
/**
* Implements common algorithms used with hierarchical phrases.
*
* @author Lane Schwartz
* @version $LastChangedDate: 2010-02-11 15:53:30 -0600 (Thu, 11 Feb 2010) $
*/
public abstract class AbstractHierarchicalPhrases implements
MatchedHierarchicalPhrases {
/** Logger for this class. */
private static final Logger logger =
Logger.getLogger(AbstractHierarchicalPhrases.class.getName());
/**
* Represents a sequence of terminal and nonterminals as
* integer IDs. The pattern is <em>not</em> rooted to a
* location in a corpus.
*/
protected final Pattern pattern;
/**
* Represents the length of each contiguous sequence of
* terminals in the pattern.
* <p>
* To save memory, this information is stored as bytes
* instead of integers.
*
* This means that the maximum value that can be stored
* here is 127. This should not be a problem unless a very
* large value is used for maximum phrase length.
*/
protected final byte[] terminalSequenceLengths;
/**
* Number of hierarchical phrases represented by this object.
*/
protected final int size;
public static int counter = 0;
/**
* Constructs an abstract object representing
* locations in a corpus that match the hierarchical phrase
* represented by the specified pattern.
*
* @param pattern Pattern representing a hierarchical phrase
*/
protected AbstractHierarchicalPhrases(Pattern pattern, int numPhrases) {
this.pattern = pattern;
this.terminalSequenceLengths = pattern.getTerminalSequenceLengths();
this.size = numPhrases;
counter++;
}
/**
* Implements the dotted operators (<̈, =̈, >̈)
* from Lopez (2008), p78-79.
* <p>
* This method behaves as follows when provided prefix
* phrase m_a_alpha and suffix phrase m_alpha_b:
* <ul>
* <li>Returns 0 if m_a_alpha and m_alpha_b can be paired.</li>
* <li>Returns -1 if m_a_alpha and m_alpha_b cannot be
* paired, and m_a_alpha precedes m_alpha_b in the
* corpus.</li>
* <li>Returns 1 if m_a_alpha and m_alpha_b cannot be
* paired, and m_a_alpha follows m_alpha_b in the
* corpus.</li>
* </ul>
*
* @param m_a_alpha List of prefix hierarchical phrases
* @param i Index into m_a_alpha
* @param m_alpha_b List of suffix hierarchical phrases
* @param j Index into m_alpha_b
* @param minNonterminalSpan Minimum allowed nonterminal span
* @param maxPhraseSpan Maximum allowed phrase span
* @return
* <ul>
* <li>0 if m_a_alpha and m_alpha_b can be paired (=̈).</li>
* <li>-1 if m_a_alpha and m_alpha_b cannot be paired, and
* m_a_alpha precedes m_alpha_b in the corpus (<̈).</li>
* <li> 1 if m_a_alpha and m_alpha_b cannot be paired, and
* m_a_alpha follows m_alpha_b in the corpus. (>̈)</li>
* </ul>
*/
protected static int compare(
MatchedHierarchicalPhrases m_a_alpha, final int i,
MatchedHierarchicalPhrases m_alpha_b, final int j,
int minNonterminalSpan, int maxPhraseSpan) {
// Try the cheapest check first: Are they in the same sentence?
{
int m_a_alpha_i_sentenceNumber = m_a_alpha.getSentenceNumber(i);
int m_alpha_b_j_sentenceNumber = m_alpha_b.getSentenceNumber(j);
if (m_a_alpha_i_sentenceNumber < m_alpha_b_j_sentenceNumber) {
return -1;
} else if (m_a_alpha_i_sentenceNumber > m_alpha_b_j_sentenceNumber) {
return 1;
}
}
int prefixStartPosition = m_a_alpha.getStartPosition(i, 0);
int suffixStartPosition = m_alpha_b.getStartPosition(j, 0);
if (prefixStartPosition > suffixStartPosition) {
return 1;
} else if (prefixStartPosition <= suffixStartPosition-maxPhraseSpan) {
return -1;
} else {
// If we get to this point, we know:
//
// * prefix and suffix are in the same sentence
// * prefix occurs before suffix in the sentence
// * prefix and suffix are within maxPhraseSpan of each other
boolean m_a_alpha_endsWithNonterminal = m_a_alpha.endsWithNonterminal();
boolean m_alpha_b_startsWithNonterminal = m_alpha_b.startsWithNonterminal();
// Does the prefix (m_a_alpha) overlap with
// the suffix (m_alpha_b) on any words?
if (m_a_alpha_endsWithNonterminal &&
m_alpha_b_startsWithNonterminal &&
m_a_alpha.arity()==1 &&
m_alpha_b.arity()==1 &&
m_a_alpha.getTerminalSequenceLength(0)==1 &&
m_alpha_b.getTerminalSequenceLength(0)==1) {
return 0;
} else {
int m_a_alphaTerminalSequenceLengths = m_a_alpha.getNumberOfTerminalSequences();//.terminalSequenceLengths.length;
int m_alpha_bTerminalSequenceLengths = m_alpha_b.getNumberOfTerminalSequences();//.terminalSequenceLengths.length;
int m_alpha_b_prefix_start = j*m_alpha_bTerminalSequenceLengths;
int m_alpha_b_prefix_end;
boolean m_a_alpha_startsWithNonterminal = m_a_alpha.startsWithNonterminal();
boolean m_alpha_b_endsWithNonterminal = m_alpha_b.endsWithNonterminal();
// If the m_alpha_b pattern ends with a nonterminal
if (m_alpha_b_endsWithNonterminal ||
// ...or if the m_alpha_b pattern ends with two terminals
m_alpha_b.endsWithTwoTerminals()) {
m_alpha_b_prefix_end = m_alpha_b_prefix_start + m_alpha_bTerminalSequenceLengths;
} else { // Then the m_alpha_b pattern ends with a nonterminal followed by a terminal
m_alpha_b_prefix_end = m_alpha_b_prefix_start + m_alpha_bTerminalSequenceLengths - 1;
}
int m_a_alpha_suffix_start;
int m_a_alpha_suffix_end;
boolean increment_m_a_alpha_suffix_start;
int m_a_alphaExtra;
// If the m_a_alpha pattern starts with a nonterminal
if (m_a_alpha_startsWithNonterminal) {
m_a_alphaExtra = 0;
m_a_alpha_suffix_start = i*m_a_alphaTerminalSequenceLengths;
m_a_alpha_suffix_end = m_a_alpha_suffix_start + m_a_alphaTerminalSequenceLengths;
increment_m_a_alpha_suffix_start = false;
} else if (m_a_alpha.secondTokenIsTerminal()) {
// Then the m_a_alpha pattern starts with two terminals
m_a_alphaExtra = 0;
m_a_alpha_suffix_start = i*m_a_alphaTerminalSequenceLengths;
m_a_alpha_suffix_end = m_a_alpha_suffix_start + m_a_alphaTerminalSequenceLengths;
increment_m_a_alpha_suffix_start = true;
} else {
// Then the m_a_alpha pattern starts with a terminal followed by a nonterminal
m_a_alphaExtra = 1;
m_a_alpha_suffix_start = i*m_a_alphaTerminalSequenceLengths + 1;
m_a_alpha_suffix_end = i*m_a_alphaTerminalSequenceLengths + m_a_alphaTerminalSequenceLengths;
increment_m_a_alpha_suffix_start = false;
}
int m_a_alpha_suffix_length = m_a_alpha_suffix_end - m_a_alpha_suffix_start;
int m_alpha_b_prefix_length = m_alpha_b_prefix_end - m_alpha_b_prefix_start;
if (m_alpha_b_prefix_length != m_a_alpha_suffix_length) {
throw new MismatchedHierarchicalPhrasesException();
} else {
int result = 0;
for (int index=0; index<m_a_alpha_suffix_length; index++) {
int a = m_a_alpha.getStartPosition(i, index+m_a_alphaExtra);
if (increment_m_a_alpha_suffix_start && index==0) {
a++;
}
int b = m_alpha_b.getStartPosition(j, index);
if (a > b) {
result = 1;
break;
} else if (a < b) {
result = -1;
break;
}
}
if (result==0) {
int positionNumber = m_alpha_bTerminalSequenceLengths-1;
int length = m_alpha_b.getStartPosition(j, positionNumber) + m_alpha_b.getTerminalSequenceLength(positionNumber) - prefixStartPosition;
if (m_alpha_b_endsWithNonterminal)
length += minNonterminalSpan;
if (m_a_alpha_startsWithNonterminal)
length += minNonterminalSpan;
if (length > maxPhraseSpan) {
result = -1;
}
}
return result;
}
}
}
}
/**
* Constructs the data to represent the hierarchical phrase,
* formed by intersecting the <code>i<code>th phrase of
* <code>M_a_alpha</code> with the <code>j<code>th phrase
* of <code>M_alpha_b</code> and appends this new data to
* the <code>data</code> list.
*
* @param M_a_alpha List of prefix hierarchical phrases
* @param i Index into M_a_alpha
* @param M_alpha_b List of suffix hierarchical phrases
* @param j Index into M_alpha_b
* @param list List where new data will be added
*/
protected static void partiallyConstruct(
MatchedHierarchicalPhrases M_a_alpha, int i,
MatchedHierarchicalPhrases M_alpha_b, int j,
List<Integer> list) {
boolean prefixEndsWithNonterminal = M_a_alpha.endsWithNonterminal();
// Get all start positions for the prefix phrase, and append them to the running list
{
int numTerminalSequences = M_a_alpha.getNumberOfTerminalSequences();
for (int index=0; index<numTerminalSequences; index++) {
list.add(M_a_alpha.getStartPosition(i, index));
}
}
if (prefixEndsWithNonterminal) {
// Get the final start positions for the suffix phrase, and append it to the running list
int index = M_alpha_b.getNumberOfTerminalSequences() - 1;
list.add(M_alpha_b.getStartPosition(j, index));
}
}
/**
* Implements the <tt>QUERY_INTERSECT</tt> algorithm from
* Adam Lopez's thesis (Lopez 2008). This implementation
* follows a corrected algorithm (Lopez, personal communication).
*
* @param pattern Pattern which will be associated with the new list
* of matched hierarchical phrases
* @param M_a_alpha Prefix list of matched hierarchical phrases
* @param M_alpha_b Suffix list of matched hierarchical phrases
* @param minNonterminalSpan Minimum allowed span for a nonterminal
* @param maxPhraseSpan Maximum allowed phrase span
* @return The list of matched hierarchical phrases resulting from
* the intersection of the two provided lists
* of matched hierarchical phrases
*/
public static MatchedHierarchicalPhrases queryIntersect(Pattern pattern,
MatchedHierarchicalPhrases M_a_alpha,
MatchedHierarchicalPhrases M_alpha_b,
int minNonterminalSpan, int maxPhraseSpan, Suffixes sourceSuffixArray) {
if (logger.isLoggable(Level.FINER)) {
logger.finer("queryIntersect("+pattern+" M_a_alpha.size=="+M_a_alpha.size() + ", M_alpha_b.size=="+M_alpha_b.size());
}
if (sourceSuffixArray!=null && sourceSuffixArray.getCachedHierarchicalPhrases().containsKey(pattern)) {
return sourceSuffixArray.getCachedHierarchicalPhrases().get(pattern);
} else {
// results is M_{a_alpha_b} in the paper
ArrayList<Integer> data = new ArrayList<Integer>();
ArrayList<Integer> sentenceNumbers = new ArrayList<Integer>();
int I = M_a_alpha.size();
int J = M_alpha_b.size();
int i = 0;
int j = 0;
while (i<I && j<J) {
while (j<J && compare(M_a_alpha, i, M_alpha_b, j, minNonterminalSpan, maxPhraseSpan) > 0) {
j++; // advance j past no longer needed item in M_alpha_b
}
int k = i;
// Process all matchings in M_a_alpha with same first element
int kth_startPosition = M_a_alpha.getStartPosition(k, 0);
while (i<I && M_a_alpha.getStartPosition(i, 0) == kth_startPosition) {
int l = j;
// While not M_a_alpha[i] <̈ M_alpha_b[l]
if (l < J) {
int comparison = compare(M_a_alpha, i, M_alpha_b, l, minNonterminalSpan, maxPhraseSpan);
while (l < J && !(comparison < 0)) {
// If M_a_alpha[i] =̈ M_alpha_b[l]
if (comparison == 0) {
// Append M_a_alpha[i] |><| M_alpha_b[l] to M_a_alpha_b
partiallyConstruct(M_a_alpha, i, M_alpha_b, l, data);
sentenceNumbers.add(M_a_alpha.getSentenceNumber(i));
} // end if
// We can visit m_alpha_b[l] again, but only next time through outermost loop
l = l + 1;
if (l < J) {
comparison = compare(M_a_alpha, i, M_alpha_b, l, minNonterminalSpan, maxPhraseSpan);
}
} // end while
} // end if
// advance i past no longer needed item in M_a_alpha
i = i + 1;
} // end while
} // end while
// if (sourceSuffixArray==null) {
return new HierarchicalPhrases(pattern, data, sentenceNumbers);
// } else {
// int[] startPositions = new int[data.size()];
// for (int index=0, n=data.size(); index<n; index++) {
// startPositions[index] = data.get(index);
// }
//
// return sourceSuffixArray.createHierarchicalPhrases(startPositions, pattern, sourceSuffixArray.getVocabulary());
// }
}
}
/* See Javadoc for MatchedHierarchicalPhrase interface. */
public int getTerminalSequenceLength(int i) {
return terminalSequenceLengths[i];
}
/* See Javadoc for MatchedHierarchicalPhrases interface. */
public int getNumberOfTerminalSequences() {
return terminalSequenceLengths.length;
}
/* See Javadoc for PatternFormat interface. */
public boolean endsWithNonterminal() {
return pattern.endsWithNonterminal();
}
/* See Javadoc for PatternFormat interface. */
public boolean startsWithNonterminal() {
return pattern.startsWithNonterminal();
}
/* See Javadoc for PatternFormat interface. */
public boolean endsWithTwoTerminals() {
return pattern.endsWithTwoTerminals();
}
/* See Javadoc for PatternFormat interface. */
public boolean secondTokenIsTerminal() {
return pattern.secondTokenIsTerminal();
}
/* See Javadoc for MatchedHierarchicalPhrases interface. */
public int getEndPosition(int phraseIndex, int positionNumber) {
return getStartPosition(phraseIndex, positionNumber) + getTerminalSequenceLength(positionNumber);
}
/* See Javadoc for MatchedHierarchicalPhrases interface. */
public int getTerminalSequenceStartIndex(int phraseIndex, int sequenceIndex) {
// int n = terminalSequenceLengths.length;
// int nthPhraseIndex = phraseIndex*n;
int start = this.getStartPosition(phraseIndex, sequenceIndex);//this.terminalSequenceStartIndices[nthPhraseIndex+sequenceIndex];
return start;
}
/* See Javadoc for MatchedHierarchicalPhrases interface. */
public int getTerminalSequenceEndIndex(int phraseIndex, int sequenceIndex) {
// int n = terminalSequenceLengths.length;
// int nthPhraseIndex = phraseIndex*n;
int start = this.getStartPosition(phraseIndex, sequenceIndex);//this.terminalSequenceStartIndices[nthPhraseIndex+sequenceIndex];
int end = start + this.terminalSequenceLengths[sequenceIndex];
return end;
}
/* See Javadoc for MatchedHierarchicalPhrases interface. */
public int getFirstTerminalIndex(int phraseIndex) {
// int n = terminalSequenceLengths.length;
// int nthPhraseIndex = phraseIndex*n;
int index = 0;
int start = this.getStartPosition(phraseIndex, index);//this.terminalSequenceStartIndices[nthPhraseIndex+index];
return start;
}
/* See Javadoc for MatchedHierarchicalPhrases interface. */
public boolean containsTerminalAt(int phraseIndex,
int alignedPointIndex) {
int n = terminalSequenceLengths.length;
// int nthPhraseIndex = phraseIndex*n;
for (int index=0; index<n; index++) {
int start = this.getStartPosition(phraseIndex, index);//this.terminalSequenceStartIndices[nthPhraseIndex+index];
if (alignedPointIndex >= start &&
alignedPointIndex < start + this.terminalSequenceLengths[index]) {
return true;
}
}
return false;
}
/* See Javadoc for MatchedHierarchicalPhrases interface. */
public int getLastTerminalIndex(int phraseIndex) {
int n = terminalSequenceLengths.length;
int index = n-1;
int start = getStartPosition(phraseIndex, index);
int end = start + this.terminalSequenceLengths[n-1];
return end;
}
/* See Javadoc for MatchedHierarchicalPhrases interface. */
public Span getSpan(int phraseIndex) {
int n = terminalSequenceLengths.length;
// int nthPhraseIndex = phraseIndex*n;
int lastIndex = n-1;
int start = this.getStartPosition(phraseIndex, 0);//this.terminalSequenceStartIndices[nthPhraseIndex+0];
int lastStart = this.getStartPosition(phraseIndex, lastIndex);//this.terminalSequenceStartIndices[nthPhraseIndex+lastIndex];
int lastLength = this.terminalSequenceLengths[lastIndex];
int end = lastStart + lastLength;
return new Span(start, end);
}
/**
* Gets the number of nonterminals in this object's pattern.
*
* @return the number of nonterminals
*/
public int arity() {
return pattern.arity;
}
/* See Javadoc for MatchedHierarchicalPhrases interface. */
public Pattern getPattern() {
return this.pattern;
}
/* See Javadoc for MatchedHierarchicalPhrases interface. */
public int size() {
return size;
}
public boolean equals(Object o) {
if (o instanceof AbstractHierarchicalPhrases) {
AbstractHierarchicalPhrases other = (AbstractHierarchicalPhrases) o;
if (this.getPattern().equals(other.getPattern())
&& this.size()==other.size()
&& this.arity()==other.arity()
&& this.getNumberOfTerminalSequences() == other.getNumberOfTerminalSequences()
&& this.endsWithNonterminal()==other.endsWithNonterminal()
&& this.startsWithNonterminal()==other.startsWithNonterminal()
&& this.endsWithTwoTerminals()==other.endsWithTwoTerminals()
&& this.secondTokenIsTerminal()==other.secondTokenIsTerminal()) {
int n = getNumberOfTerminalSequences();
for (int i=0, size=this.size(); i<size; i++) {
for (int seq=0; seq<n; seq++) {
if (this.getStartPosition(i, seq) != other.getStartPosition(i, seq) ||
this.getEndPosition(i, seq) != other.getEndPosition(i, seq)) {
return false;
}
}
}
return true;
} else {
return false;
}
} else {
return false;
}
}
protected static Pattern getPatternWithInitialX(Pattern pattern) {
int[] xwords = new int[pattern.words.length+1];
xwords[0] = SymbolTable.X;
for (int i=0; i<pattern.words.length; i++) {
xwords[i+1] = pattern.words[i];
}
return new Pattern(pattern.vocab, xwords);
}
protected Pattern getPatternWithInitialX() {
return getPatternWithInitialX(pattern);
}
protected Pattern getPatternWithFinalX() {
return new Pattern(pattern.vocab, pattern.words, SymbolTable.X);
}
}