/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.corpus.suffix_array;
import joshua.corpus.AbstractPhrase;
import joshua.corpus.Phrase;
import joshua.corpus.TerminalIterator;
import joshua.corpus.vocab.SymbolTable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
* Phrase encapsulates an int[] of word IDs, and provides some basic
* functionality for manipulating phrases.
*
* @author Josh Schroeder
* @since 30 July 2003
* @author Chris Callison-Burch
* @since 29 May 2008
* @version $LastChangedDate:2008-07-30 17:15:52 -0400 (Wed, 30 Jul 2008) $
*/
public class BasicPhrase extends AbstractPhrase {
//===============================================================
// Member variables
//===============================================================
protected SymbolTable vocab;
protected int[] words;
//===============================================================
// Constructor(s)
//===============================================================
/**
* Constructs a basic phrase.
*
* @param words Array of word symbols for this phrase.
* @param vocab Symbol table to use for this phrase.
*/
public BasicPhrase(int[] words, SymbolTable vocab) {
this.vocab = vocab;
this.words = words;
}
/**
* Constructs a basic phrase.
*
* @param vocab Symbol table to use for this phrase.
* @param words Array of word symbols for this phrase.
*/
public BasicPhrase(SymbolTable vocab, int...words) {
this.vocab = vocab;
this.words = words;
}
/**
* Constructor tokenizes the phrase string at whitespace
* characters and looks up the IDs of the words using the
* Vocabulary.
*
* @param phraseString a String of the format "Hello , world ."
* @param vocab Symbol table to use for this phrase
*/
public BasicPhrase(String phraseString, SymbolTable vocab) {
this.vocab = vocab;
String[] wordStrings = phraseString.split("\\s+");
words = new int[wordStrings.length];
for (int i = 0; i < wordStrings.length; i++) {
words[i] = vocab.addTerminal(wordStrings[i]);
}
}
//===============================================================
// Public
//===============================================================
//===========================================================
// Accessor methods (set/get)
//===========================================================
/**
* Gets the symbol table associated with this phrase.
*
* @return the vocabulary that the words in this phrase are
* drawn from.
*/
public SymbolTable getVocab() {
return vocab;
}
/**
* Gets the integer identifier for the word at the specified
* position.
*
* @param position Index into the corpus
* @return the integer identifier for the word at the
* specified position
*/
public int getWordID(int position) {
return words[position];
}
/**
* Gets the number of tokens in this phrase.
*
* @return the number of tokens in this phrase
*/
public int size() {
return words.length;
}
/**
* This method gets the integer IDs of the phrase as an
* array of ints.
* <p>
* This method does <emph>not</emph> copy the array, and
* so may be called very cheaply.
*
* @return an int[] corresponding to the ID of each word
* in the phrase
*/
public int[] getWordIDs() {
return words;
}
//===========================================================
// Methods
//===========================================================
/**
* Gets a space-delimited string representing the words in
* this Phrase
*
* @return a space-delimited string of the words in this
* Phrase
*/
public String toString() {
StringBuffer buf = new StringBuffer();
for (int i = 0; i < size(); i++) {
String word = vocab.getWord(words[i]);
buf.append(word);
if (i < size() - 1) {
buf.append(' ');
}
}
return buf.toString();
}
/**
* Gets all possible subphrases of this phrase, up to and
* including the phrase itself. For example, the phrase "I
* like cheese ." would return the following:
* <ul>
* <li>I
* <li>like
* <li>cheese
* <li>.
* <li>I like
* <li>like cheese
* <li>cheese .
* <li>I like cheese
* <li>like cheese .
* <li>I like cheese .
* </ul>
*
* @return List of all possible subphrases.
*/
public List<Phrase> getSubPhrases() {
return getSubPhrases(size());
}
/**
* Returns a list of subphrases only of length
* <code>maxLength</code> or smaller.
*
* @param maxLength the maximum length phrase to return.
* @return List of all possible subphrases of length maxLength
* or less
* @see #getSubPhrases()
*/
public List<Phrase> getSubPhrases(int maxLength) {
if (maxLength > size()) {
return getSubPhrases(size());
}
List<Phrase> phrases = new ArrayList<Phrase>();
for (int i = 0; i < size(); i++) {
for (int j = i + 1; (j <= size()) && (j - i <= maxLength); j++) {
Phrase subPhrase = subPhrase(i,j);
phrases.add(subPhrase);
}
}
return phrases;
}
/**
* Creates a new phrase object from the indexes provided.
* <P>
* NOTE: subList merely creates a "view" of the existing
* Phrase object. Memory taken up by other Words in the
* Phrase is not freed since the underlying subList object
* still points to the complete Phrase List.
*
* @param start Inclusive starting index
* @param end Exclusive ending index
* @return Phrase object representing the specified range
* @see java.util.ArrayList#subList(int, int)
*/
public Phrase subPhrase(int start, int end) {
int subPhraseLength = end - start;
int[] subPhraseWords = new int[subPhraseLength];
for (int i = 0; i < subPhraseLength; i++) {
subPhraseWords[i] = words[i+start];
}
return new BasicPhrase(subPhraseWords, vocab);
}
/**
* Gets an object capable of iterating over all terminals in this pattern.
*
* @return an object capable of iterating
* over all terminals in this pattern
*/
public Iterable<Integer> getTerminals() {
return new Iterable<Integer>() {
public Iterator<Integer> iterator() {
return new TerminalIterator(vocab,words);
}
};
}
/**
* Compares the two strings based on the lexicographic order
* of words defined in the Vocabulary.
*
* @param other the object to compare to
* @return -1 if this object is less than the parameter, 0
* if equals, 1 if greater
*/
public int compareTo(Phrase other) {
for (int i = 0; i < words.length; i++) {
if (i < other.size()) {
int difference = words[i] - other.getWordID(i);
if (difference != 0) {
return difference;
}
} else {
//same but other is shorter, so we are after
return 1;
}
}
if (size() < other.size()) {
return -1;
} else {
return 0;
}
}
//===============================================================
// Private
//===============================================================
}