Package joshua.corpus

Examples of joshua.corpus.Corpus


    int sentenceNumber = 1;
    int endOfSentence = suffixes.getSentencePosition(sentenceNumber);

    if (logger.isLoggable(Level.FINEST)) logger.finest("END OF SENT: " + endOfSentence);

    Corpus corpus = suffixes.getCorpus();
    int endOfCorpus = corpus.size();
   
    // Start at the beginning of the corpus...
    for (int currentPosition : corpus.corpusPositions()) {
         
      // Start with a phrase length of 1, at the current position...
      for (int i = 1, endOfPhrase = currentPosition + i;
          // ...ensure the phrase length isn't too long...
          i <= maxPhraseLength  && 
View Full Code Here


  ) {
   
    PriorityQueue<Counted<Phrase>> frequentPhrases = new PriorityQueue<Counted<Phrase>>();
    Set<Integer> prunedFrequencies = new HashSet<Integer>();
   
    Corpus corpus = suffixes.getCorpus();
   
    FrequencyClasses frequencyClasses = getFrequencyClasses(suffixes);
   
    for (FrequencyClass frequencyClass : frequencyClasses.withMinimumFrequency(minFrequency)) {
     
View Full Code Here

   * @return Longest common prefix array
   */
  protected static int[] calculateLongestCommonPrefixes(Suffixes suffixes) {

    int length = suffixes.size();
    Corpus corpus = suffixes.getCorpus();

    int[] longestCommonPrefixes = new int[length +1];
   
    // For each element in the suffix array
    for (int i = 1; i < length; i++) {
      int corpusIndex = suffixes.getCorpusIndex(i);
      int prevCorpusIndex = suffixes.getCorpusIndex(i-1);

      // Start by assuming that the two positions
      //    don't have anything in common
      int commonPrefixSize = 0;
     
      // While the 1st position is not at the end of the corpus...
      while(corpusIndex+commonPrefixSize < length &&
          // ... and the 2nd position is not at the end of the corpus...
          prevCorpusIndex + commonPrefixSize < length &&
          // ... and the nth word at the 1st position ...
          (corpus.getWordID(corpusIndex  + commonPrefixSize) ==
            // ... is the same as the nth word at the 2nd position ...
            corpus.getWordID(prevCorpusIndex + commonPrefixSize) &&
            // ... and the length to consider isn't too long
            commonPrefixSize <= Suffixes.MAX_COMPARISON_LENGTH)) {
       
        // The two positions match for their respective nth words!
        // Increment commonPrefixSize to reflect this fact
View Full Code Here

 
 
  private Map<Phrase,InvertedIndex> calculateInvertedIndices() {
    Map<Phrase,InvertedIndex> invertedIndices = new HashMap<Phrase,InvertedIndex>(frequentPhrases.keySet().size());
   
    Corpus corpus = suffixes.getCorpus();
    int endOfCorpus = corpus.size();
    logger.fine("Corpus has size " + endOfCorpus);
   
    int sentenceNumber = 0;
    int endOfSentence = suffixes.getSentencePosition(sentenceNumber+1);
    boolean trackMe = false;
    // Start at the beginning of the corpus...
    for (int currentPosition : corpus.corpusPositions()) {
//         
      if (trackMe)
        {
        logger.fine("At corpus position " + currentPosition);
        }
View Full Code Here

 
  public static void main(String[] args) throws IOException, ClassNotFoundException {


    Vocabulary symbolTable;
    Corpus corpusArray;
    Suffixes suffixArray;
    FrequentPhrases frequentPhrases;

    if (args.length == 1) {
View Full Code Here

    FormatUtil.useUTF8();
 
    try {
     
      Vocabulary symbolTable;
      Corpus corpusArray;
      Suffixes suffixArray;
     
      logger.fine("Constructing vocabulary from file " + corpusFileName);
      symbolTable = new Vocabulary();
      int[] lengths = Vocabulary.initializeVocabulary(corpusFileName, symbolTable, true);
View Full Code Here

TOP

Related Classes of joshua.corpus.Corpus

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.