Examples of joshua.corpus.suffix_array.BasicPhrase

joshua.corpus.suffix_array.BasicPhrase
Phrase encapsulates an int[] of word IDs and provides some basic functionality for manipulating phrases. Authors: Josh Schroeder (since 30 July 2003) and Chris Callison-Burch (since 29 May 2008). Version: $LastChangedDate:2008-07-30 17:15:52 -0400 (Wed, 30 Jul 2008) $

    int numWords = 0;
    
    LineReader lineReader = new LineReader(inputFilename);
    
    for (String line : lineReader) {
      BasicPhrase sentence = new BasicPhrase(line, vocab);
      numWords += sentence.size();
      numSentences++;
      if(logger.isLoggable(Level.FINE) && numSentences % 10000==0) logger.fine(""+numWords);
    }
    
//    if (fixVocabulary) {

View Full Code Here

   */
  private void calculateSourceGivenTarget(Integer targetWord) {


    Map<Integer,Integer> counts = new HashMap<Integer,Integer>();
    
    int[] targetSuffixArrayBounds = targetSuffixArray.findPhrase(new BasicPhrase(targetVocab, targetWord));
    int step = (targetSuffixArrayBounds[1]-targetSuffixArrayBounds[0]<sampleSize) ? 1 : (targetSuffixArrayBounds[1]-targetSuffixArrayBounds[0]) / sampleSize;
    
    float total = 0;
    
    for (int targetSuffixArrayIndex=targetSuffixArrayBounds[0],samples=0; targetSuffixArrayIndex<=targetSuffixArrayBounds[1] && samples<sampleSize; targetSuffixArrayIndex+=step, samples++) {

View Full Code Here


    if (logger.isLoggable(Level.FINE)) logger.fine("Calculating lexprob distribution P( TARGET | " + sourceVocab.getWord(sourceWord) + "); sourceWord ID == " + sourceWord);
        
    Map<Integer,Integer> counts = new HashMap<Integer,Integer>();
    
    int[] sourceSuffixArrayBounds = sourceSuffixArray.findPhrase(new BasicPhrase(sourceVocab, sourceWord));
    int step = (sourceSuffixArrayBounds[1]-sourceSuffixArrayBounds[0]<sampleSize) ? 1 : (sourceSuffixArrayBounds[1]-sourceSuffixArrayBounds[0]) / sampleSize;
    
    float total = 0;
    
    for (int sourceSuffixArrayIndex=sourceSuffixArrayBounds[0],samples=0; sourceSuffixArrayIndex<=sourceSuffixArrayBounds[1] && samples<sampleSize; sourceSuffixArrayIndex+=step, samples++) {

View Full Code Here

    }


    sourceVocab = new Vocabulary(sourceWords);
    


    corpusSentence = new BasicPhrase(corpusString, sourceVocab);
    
    targetCorpusString = "das macht ihn und es beschädigt ihn , es setzt ihn auf und es führt ihn aus .";
    Set<String> targetWords = new HashSet<String>();
    for (String targetWord : targetCorpusString.split("\\s+")) {
      targetWords.add(targetWord);
    }
    
    targetVocab = new Vocabulary(targetWords);
    
    ntVocab = new HashMap<Integer,String>();
    ntVocab.put(-1, "X");
    
    {
      // create the suffix array...
      int[] sentenceStartPositions = {0};
      
      Assert.assertEquals(corpusSentence.size(), 18);
      
      int[] corpus = new int[corpusSentence.size()];
      for(int i = 0; i < corpusSentence.size(); i++) {
        corpus[i] = corpusSentence.getWordID(i);
      }
      
      CorpusArray corpusArray = new CorpusArray(corpus, sentenceStartPositions, sourceVocab);
      suffixArray = new SuffixArray(corpusArray);
      




      int[] targetSentenceStartPositions = {0};
      
      BasicPhrase targetCorpusSentence = new BasicPhrase(targetCorpusString, targetVocab);
      Assert.assertEquals(targetCorpusSentence.size(), 18);
      
      int[] targetCorpus = new int[targetCorpusSentence.size()];
      for(int i = 0; i < targetCorpusSentence.size(); i++) {
        targetCorpus[i] = targetCorpusSentence.getWordID(i);
      }
      


      


      
      CorpusArray targetCorpusArray = new CorpusArray(targetCorpus, targetSentenceStartPositions, targetVocab);
      targetSuffixArray = new SuffixArray(targetCorpusArray);


      
      int[] lowestAlignedTargetIndex = new int[corpusSentence.size()];
      int[] highestAlignedTargetIndex = new int[corpusSentence.size()];
      int[] lowestAlignedSourceIndex = new int[targetCorpusSentence.size()];
      int[] highestAlignedSourceIndex = new int[targetCorpusSentence.size()];
      
      int[][] alignedTargetIndices = new int[corpusSentence.size()][];
      int[][] alignedSourceIndices = new int[targetCorpusSentence.size()][];
      
      
      
      {
        for (int i=0; i<18; i++) {

View Full Code Here

    int sampleSize = 300;
    int minNonterminalSpan = 2;
//    RuleExtractor ruleExtractor = new HierarchicalRuleExtractor(suffixArray, targetCorpusArray, alignments, lexProbs, sampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan);
    
    
    BasicPhrase query = new BasicPhrase("it makes him", sourceVocab);
    //BasicPhrase query = new BasicPhrase("it makes him and it mars him", sourceVocab);
    //BasicPhrase query = new BasicPhrase("it makes him and it mars him , it sets him on and it takes him off .", sourceVocab);
    ParallelCorpusGrammarFactory parallelCorpus = new ParallelCorpusGrammarFactory(suffixArray, targetSuffixArray, alignments, null, sampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost);
//    simplePrefixTree = new PrefixTree(suffixArray, targetCorpusArray, alignments, suffixArray.getVocabulary(), lexProbs, ruleExtractor, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan);
    simplePrefixTree = new PrefixTree(parallelCorpus);
    simplePrefixTree.add(query.getWordIDs());
    
    Assert.assertNotNull(simplePrefixTree.root);
    Assert.assertNotNull(simplePrefixTree.root.children);
    
    /////////////////////////////

View Full Code Here

//    RuleExtractor ruleExtractor = new HierarchicalRuleExtractor(suffixArray, targetCorpusArray, alignments, lexProbs, sampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan);
    
    
    //BasicPhrase query = new BasicPhrase("it makes him", sourceVocab);
    //BasicPhrase query = new BasicPhrase("it makes him and it mars him", sourceVocab);
    BasicPhrase query = new BasicPhrase("it makes him and it mars him , it sets him on and it takes him off .", sourceVocab);
    ParallelCorpusGrammarFactory parallelCorpus = new ParallelCorpusGrammarFactory(suffixArray, targetSuffixArray, alignments, null, sampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost);
//    PrefixTree prefixTree = new PrefixTree(suffixArray, targetCorpusArray, alignments, suffixArray.getVocabulary(), lexProbs, ruleExtractor, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan);
    PrefixTree prefixTree = new PrefixTree(parallelCorpus);
    prefixTree.add(query.getWordIDs());
      
    Assert.assertNotNull(prefixTree.root);
    Assert.assertNotNull(prefixTree.root.children);
    
    /////////////////////////////

View Full Code Here

//    RuleExtractor ruleExtractor = new HierarchicalRuleExtractor(suffixArray, targetCorpusArray, alignments, lexProbs, sampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan);
    
    
    String queryString = "it persuades him and it disheartens him";
    
    BasicPhrase querySentence = new BasicPhrase(queryString, sourceVocab);
    
    Assert.assertEquals(querySentence.toString(), "it UNK him and it UNK him");
    Assert.assertEquals(corpusSentence.toString(), corpusString);
    ParallelCorpusGrammarFactory parallelCorpus = new ParallelCorpusGrammarFactory(suffixArray, targetSuffixArray, alignments, null, sampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost);


//    PrefixTree prefixTree = new PrefixTree(suffixArray, targetCorpusArray, alignments, suffixArray.getVocabulary(), lexProbs, ruleExtractor, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan);
    PrefixTree prefixTree = new PrefixTree(parallelCorpus);
    prefixTree.add(querySentence.getWordIDs());


    
    
    Assert.assertTrue(prefixTree.root.children.containsKey(PrefixTree.X));
    Assert.assertTrue(prefixTree.root.children.containsKey(sourceVocab.getID("it")));

View Full Code Here

    
    // Adam Lopez's example...
    String corpusString = "it makes him and it mars him , it sets him on and it takes him off .";


    vocab = new Vocabulary();
    Phrase exampleSentence = new BasicPhrase(corpusString, vocab);
    
    exampleSentence = new BasicPhrase(corpusString, vocab);
    int[] sentences = new int[1];
    sentences[0] = 0;
    int[] corpus = new int[exampleSentence.size()];
    for(int i = 0; i < exampleSentence.size(); i++) {
      corpus[i] = exampleSentence.getWordID(i);
    }
    
    CorpusArray corpusArray = new CorpusArray(corpus, sentences, vocab);
    
    if (binaryFileName==null || binaryFileName.trim().length()==0)

View Full Code Here

  @Test
  public void findPhrase() {
    
    // Look up phrase "it makes him"
    
    Phrase phrase = new BasicPhrase("it makes him", vocab);
    int[] bounds = suffixArray.findPhrase(phrase);
    
    int expectedSuffixArrayStartIndex = 0;
    int expectedSuffixArrayEndIndex = 0;
    
    Assert.assertEquals(bounds.length, 2);
    Assert.assertEquals(bounds[0], expectedSuffixArrayStartIndex);
    Assert.assertEquals(bounds[1], expectedSuffixArrayEndIndex);
    
    
    // Look up phrase "and it"
    
    phrase = new BasicPhrase("and it", vocab);
    bounds = suffixArray.findPhrase(phrase);
    
    expectedSuffixArrayStartIndex = 9;
    expectedSuffixArrayEndIndex = 10;

View Full Code Here

TOP

Related Classes of joshua.corpus.suffix_array.BasicPhrase

joshua.corpus.lexprob.SampledLexProbs

joshua.corpus.Phrase

joshua.corpus.suffix_array.SuffixArrayTest

joshua.corpus.TerminalIterator

joshua.corpus.vocab.Vocabulary

joshua.prefix_tree.PrefixTreeAdvancedTest

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by Oracle Inc. Contact: coftware#gmail.com.