/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.corpus.alignment;
import joshua.corpus.Corpus;
import joshua.corpus.Span;
/**
* Abstract implementation of <code>Alignments</code> interface
* that includes code likely to be common to implementations which
* conceptually view alignment points as a grid.
* <p>
* This class implements all methods defined by the
* <code>Alignments</code> interface except for {@link #size()}.
*
* Any concrete child class need only implement that method and
* the two abstract protected methods defined here.
*
* @author Lane Schwartz
*/
public abstract class AbstractAlignmentGrids extends AbstractAlignments {

	/** Source language corpus. */
	protected final Corpus sourceCorpus;

	/** Target language corpus. */
	protected final Corpus targetCorpus;

	/**
	 * Constructs an abstract alignments grid.
	 *
	 * @param sourceCorpus Source language corpus
	 * @param targetCorpus Target language corpus
	 * @param requireTightSpans Indicates whether tight spans
	 *                          are required during phrase extraction
	 */
	public AbstractAlignmentGrids(Corpus sourceCorpus, Corpus targetCorpus, boolean requireTightSpans) {
		super(requireTightSpans);
		this.sourceCorpus = sourceCorpus;
		this.targetCorpus = targetCorpus;
	}

	/**
	 * Gets the indices of all source words aligned to the
	 * specified span in the specified sentence.
	 * <p>
	 * All indices in this method are zero-based.
	 * <p>
	 * The span parameters of this method are relative to the
	 * sentence. So, for example, calling this method to get the
	 * source indices for a target span covering the first three
	 * words of the eighth sentence in the parallel corpus, the
	 * following parameter values would be used:
	 *
	 * <code>getSourcePoints(7, 0, 3)</code>
	 * <p>
	 * Callers in this class treat a <code>null</code> or empty
	 * result as "nothing aligned", and read the first and last
	 * elements of the result as the smallest and largest aligned
	 * index.
	 *
	 * @param sentenceID Index of a sentence in the aligned parallel corpus
	 * @param targetSpanStart Inclusive start index in the target sentence
	 * @param targetSpanEnd Exclusive end index in the target sentence
	 * @return the indices of all source words aligned to the
	 *         specified span in the specified sentence
	 */
	protected abstract int[] getSourcePoints(int sentenceID, int targetSpanStart, int targetSpanEnd);

	/**
	 * Gets the indices of all target words aligned to the
	 * specified span in the specified sentence.
	 * <p>
	 * All indices in this method are zero-based.
	 * <p>
	 * The span parameters of this method are relative to the
	 * sentence. So, for example, calling this method to get
	 * the target indices for a source span covering the first
	 * three words of the eighth sentence in the parallel corpus,
	 * the following parameter values would be used:
	 *
	 * <code>getTargetPoints(7, 0, 3)</code>
	 * <p>
	 * Callers in this class treat a <code>null</code> or empty
	 * result as "nothing aligned", and read the first and last
	 * elements of the result as the smallest and largest aligned
	 * index.
	 *
	 * @param sentenceID Index of a sentence in the aligned parallel corpus
	 * @param sourceSpanStart Inclusive start index in the source sentence
	 * @param sourceSpanEnd Exclusive end index in the source sentence
	 * @return the indices of all target words aligned to the
	 *         specified span in the specified sentence
	 */
	protected abstract int[] getTargetPoints(int sentenceID, int sourceSpanStart, int sourceSpanEnd);

	/**
	 * Gets the corpus-level indices of the source words aligned
	 * to a single target word.
	 * <p>
	 * See Javadoc for Alignments interface.
	 *
	 * @param targetIndex Corpus-level index of a target word
	 * @return corpus-level indices of the aligned source words,
	 *         or <code>null</code> if no source word is aligned
	 */
	public int[] getAlignedSourceIndices(int targetIndex) {

		int sentenceID = targetCorpus.getSentenceIndex(targetIndex);
		int sourceOffset = sourceCorpus.getSentencePosition(sentenceID);
		int targetOffset = targetCorpus.getSentencePosition(sentenceID);

		// The grid works with sentence-relative positions.
		int normalizedTargetIndex = targetIndex - targetOffset;

		int[] sourcePoints = getSourcePoints(sentenceID, normalizedTargetIndex, normalizedTargetIndex + 1);

		return shiftedCopyOrNull(sourcePoints, sourceOffset);
	}

	/**
	 * Gets the corpus-level source span aligned to a target span.
	 * <p>
	 * See Javadoc for Alignments interface.
	 * <p>
	 * Note: unlike {@link #getAlignedTargetSpan(int, int)}, this
	 * method does not consult <code>requireTightSpans</code>.
	 *
	 * @param startTargetIndex Inclusive corpus-level start index of the target span
	 * @param endTargetIndex Exclusive corpus-level end index of the target span
	 * @return the aligned source span (end exclusive), or a span of
	 *         (<code>UNALIGNED</code>, <code>UNALIGNED</code>) if
	 *         no source word is aligned
	 */
	public Span getAlignedSourceSpan(int startTargetIndex, int endTargetIndex) {

		// The sentence is determined from the start of the span only.
		int sentenceID = targetCorpus.getSentenceIndex(startTargetIndex);
		int sourceOffset = sourceCorpus.getSentencePosition(sentenceID);
		int targetOffset = targetCorpus.getSentencePosition(sentenceID);

		int normalizedTargetStartIndex = startTargetIndex - targetOffset;
		int normalizedTargetEndIndex = endTargetIndex - targetOffset;

		int[] sourcePoints = getSourcePoints(sentenceID, normalizedTargetStartIndex, normalizedTargetEndIndex);

		if (isEmpty(sourcePoints)) {
			return new Span(UNALIGNED, UNALIGNED);
		}

		// First and last points are taken as the extremes of the span.
		// NOTE(review): this relies on getSourcePoints returning
		// indices in ascending order - confirm against implementations.
		int startSourceIndex = sourceOffset + sourcePoints[0];
		int endSourceIndex = sourceOffset + sourcePoints[sourcePoints.length - 1] + 1;

		return new Span(startSourceIndex, endSourceIndex);
	}

	/**
	 * Gets the corpus-level indices of the target words aligned
	 * to a single source word.
	 * <p>
	 * See Javadoc for Alignments interface.
	 *
	 * @param sourceIndex Corpus-level index of a source word
	 * @return corpus-level indices of the aligned target words,
	 *         or <code>null</code> if no target word is aligned
	 */
	public int[] getAlignedTargetIndices(int sourceIndex) {

		int sentenceID = sourceCorpus.getSentenceIndex(sourceIndex);
		int targetOffset = targetCorpus.getSentencePosition(sentenceID);
		int sourceOffset = sourceCorpus.getSentencePosition(sentenceID);

		// The grid works with sentence-relative positions.
		int normalizedSourceIndex = sourceIndex - sourceOffset;

		int[] targetPoints = getTargetPoints(sentenceID, normalizedSourceIndex, normalizedSourceIndex + 1);

		return shiftedCopyOrNull(targetPoints, targetOffset);
	}

	/**
	 * Gets the corpus-level target span aligned to a source span.
	 * <p>
	 * See Javadoc for Alignments interface.
	 * <p>
	 * When <code>requireTightSpans</code> is set, the span is
	 * reported as unaligned unless both the first and the last
	 * word of the source span have at least one aligned target word.
	 *
	 * @param startSourceIndex Inclusive corpus-level start index of the source span
	 * @param endSourceIndex Exclusive corpus-level end index of the source span
	 * @return the aligned target span (end exclusive), or a span of
	 *         (<code>UNALIGNED</code>, <code>UNALIGNED</code>) if
	 *         the source span is unaligned or fails the tightness check
	 */
	public Span getAlignedTargetSpan(int startSourceIndex, int endSourceIndex) {

		// The sentence is determined from the start of the span only.
		int sentenceID = sourceCorpus.getSentenceIndex(startSourceIndex);
		int targetOffset = targetCorpus.getSentencePosition(sentenceID);
		int sourceOffset = sourceCorpus.getSentencePosition(sentenceID);

		int normalizedSourceStartIndex = startSourceIndex - sourceOffset;
		int normalizedSourceEndIndex = endSourceIndex - sourceOffset;

		int[] targetPoints = getTargetPoints(sentenceID, normalizedSourceStartIndex, normalizedSourceEndIndex);

		if (isEmpty(targetPoints)) {
			return new Span(UNALIGNED, UNALIGNED);
		}

		// The boundary words are only looked up when tightness is
		// actually required; || short-circuits so the last word is
		// not queried once the first word is known to be unaligned.
		if (requireTightSpans
				&& (isEmpty(getTargetPoints(sentenceID, normalizedSourceStartIndex, normalizedSourceStartIndex + 1))
				|| isEmpty(getTargetPoints(sentenceID, normalizedSourceEndIndex - 1, normalizedSourceEndIndex)))) {
			return new Span(UNALIGNED, UNALIGNED);
		}

		// First and last points are taken as the extremes of the span.
		// NOTE(review): this relies on getTargetPoints returning
		// indices in ascending order - confirm against implementations.
		int startTargetIndex = targetOffset + targetPoints[0];
		int endTargetIndex = targetOffset + targetPoints[targetPoints.length - 1] + 1;

		return new Span(startTargetIndex, endTargetIndex);
	}

	/**
	 * Tests whether a list of alignment points denotes "nothing aligned".
	 *
	 * @param points Alignment points, possibly <code>null</code>
	 * @return <code>true</code> if the array is <code>null</code> or has no elements
	 */
	private static boolean isEmpty(int[] points) {
		return points == null || points.length == 0;
	}

	/**
	 * Converts sentence-relative alignment points into corpus-level
	 * indices without modifying the array that was passed in.
	 *
	 * @param points Sentence-relative alignment points, possibly <code>null</code>
	 * @param offset Corpus position of the first word of the sentence
	 * @return a new array holding each point plus the offset, or
	 *         <code>null</code> if there are no points
	 */
	private static int[] shiftedCopyOrNull(int[] points, int offset) {

		if (isEmpty(points)) {
			return null;
		}

		// Write into a fresh array: the input belongs to the
		// subclass and might be shared or cached there.
		int[] shifted = new int[points.length];
		for (int i = 0; i < points.length; i++) {
			shifted[i] = points[i] + offset;
		}

		return shifted;
	}
}