Package joshua.corpus.suffix_array

Source Code of joshua.corpus.suffix_array.PhrasePairCollocations

package joshua.corpus.suffix_array;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import joshua.corpus.Corpus;
import joshua.corpus.Phrase;
import joshua.corpus.vocab.SymbolTable;
import joshua.util.IntegerPair;
import joshua.util.Pair;

public class PhrasePairCollocations {
 
  private final Map<Pair<Phrase,Phrase>,List<IntegerPair>> phrasePairCollocations =
      new HashMap<Pair<Phrase,Phrase>,List<IntegerPair>>();
 
  private final Corpus corpus;
 
  public PhrasePairCollocations(Corpus corpus) {
    this.corpus = corpus;
  }
 
  void record(Phrase phrase1, Phrase phrase2, int position1, int position2) {
   
    Pair<Phrase,Phrase> phrasePair = new Pair<Phrase,Phrase>(phrase1,phrase2);
   
    if (! phrasePairCollocations.containsKey(phrasePair)) {
      phrasePairCollocations.put(phrasePair, new ArrayList<IntegerPair>());
    }
   
    List<IntegerPair> locations = phrasePairCollocations.get(phrasePair);
   
    locations.add(new IntegerPair(position1, position2));
  }
 
  List<HierarchicalPhrases> getHierarchicalPhrases() {
   
    SymbolTable vocab = corpus.getVocabulary();
    int X = vocab.addNonterminal("X");
   
    List<HierarchicalPhrases> result = new ArrayList<HierarchicalPhrases>();
   
    for (Map.Entry<Pair<Phrase,Phrase>,List<IntegerPair>> entry : phrasePairCollocations.entrySet()) {
      Pair<Phrase,Phrase> phrasePair = entry.getKey();
      int[] phrase1 = phrasePair.first.getWordIDs();
      int[] phrase2 = phrasePair.second.getWordIDs();
     
      int[] phrasePairTokens = new int[phrase1.length+1+phrase2.length]; {
        for (int index=0; index<phrase1.length; index++) {
          phrasePairTokens[index] = phrase1[index];
        }
        phrasePairTokens[phrase1.length] = X;
        for (int start=phrase1.length+1, index=start, end=start+phrase2.length; index<end; index++) {
          phrasePairTokens[index] = phrase2[index - start];
        }
      }
     
      List<IntegerPair> locations = entry.getValue();
     
      int[] startPositions = new int[locations.size()*2];
      int index=0;
        for (IntegerPair location : locations) {
          startPositions[index++] = location.first;
          startPositions[index++] = location.second;
        }
      }
     
      int[] sentenceNumbers = new int[locations.size()];
      int index=0;
        for (IntegerPair location : locations) {
          sentenceNumbers[index++] = corpus.getSentenceIndex(location.first);
        }
       
      }
     
      Pattern hierarchicalPhrase = new Pattern(vocab, phrasePairTokens);
      HierarchicalPhrases phraseLocations = new HierarchicalPhrases(hierarchicalPhrase,startPositions,sentenceNumbers);
      result.add(phraseLocations);
    }
   
    return result;
  }
 
}
TOP

Related Classes of joshua.corpus.suffix_array.PhrasePairCollocations

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.