Package joshua.corpus.suffix_array

Examples of joshua.corpus.suffix_array.BasicPhrase


    int numWords = 0;
   
    LineReader lineReader = new LineReader(inputFilename);
   
    for (String line : lineReader) {
      BasicPhrase sentence = new BasicPhrase(line, vocab);
      numWords += sentence.size();
      numSentences++;
      if(logger.isLoggable(Level.FINE) && numSentences % 10000==0) logger.fine(""+numWords);
    }
   
//    if (fixVocabulary) {
View Full Code Here


   */
  private void calculateSourceGivenTarget(Integer targetWord) {

    Map<Integer,Integer> counts = new HashMap<Integer,Integer>();
   
    int[] targetSuffixArrayBounds = targetSuffixArray.findPhrase(new BasicPhrase(targetVocab, targetWord));
    int step = (targetSuffixArrayBounds[1]-targetSuffixArrayBounds[0]<sampleSize) ? 1 : (targetSuffixArrayBounds[1]-targetSuffixArrayBounds[0]) / sampleSize;
   
    float total = 0;
   
    for (int targetSuffixArrayIndex=targetSuffixArrayBounds[0],samples=0; targetSuffixArrayIndex<=targetSuffixArrayBounds[1] && samples<sampleSize; targetSuffixArrayIndex+=step, samples++) {
View Full Code Here

    if (logger.isLoggable(Level.FINE)) logger.fine("Calculating lexprob distribution P( TARGET | " + sourceVocab.getWord(sourceWord) + "); sourceWord ID == " + sourceWord);
       
    Map<Integer,Integer> counts = new HashMap<Integer,Integer>();
   
    int[] sourceSuffixArrayBounds = sourceSuffixArray.findPhrase(new BasicPhrase(sourceVocab, sourceWord));
    int step = (sourceSuffixArrayBounds[1]-sourceSuffixArrayBounds[0]<sampleSize) ? 1 : (sourceSuffixArrayBounds[1]-sourceSuffixArrayBounds[0]) / sampleSize;
   
    float total = 0;
   
    for (int sourceSuffixArrayIndex=sourceSuffixArrayBounds[0],samples=0; sourceSuffixArrayIndex<=sourceSuffixArrayBounds[1] && samples<sampleSize; sourceSuffixArrayIndex+=step, samples++) {
View Full Code Here

    }

    sourceVocab = new Vocabulary(sourceWords);
   

    corpusSentence = new BasicPhrase(corpusString, sourceVocab);
   
    targetCorpusString = "das macht ihn und es beschädigt ihn , es setzt ihn auf und es führt ihn aus .";
    Set<String> targetWords = new HashSet<String>();
    for (String targetWord : targetCorpusString.split("\\s+")) {
      targetWords.add(targetWord);
    }
   
    targetVocab = new Vocabulary(targetWords);
   
    ntVocab = new HashMap<Integer,String>();
    ntVocab.put(-1, "X");
   
    {
      // create the suffix array...
      int[] sentenceStartPositions = {0};
     
      Assert.assertEquals(corpusSentence.size(), 18);
     
      int[] corpus = new int[corpusSentence.size()];
      for(int i = 0; i < corpusSentence.size(); i++) {
        corpus[i] = corpusSentence.getWordID(i);
      }
     
      CorpusArray corpusArray = new CorpusArray(corpus, sentenceStartPositions, sourceVocab);
      suffixArray = new SuffixArray(corpusArray);
     


      int[] targetSentenceStartPositions = {0};
     
      BasicPhrase targetCorpusSentence = new BasicPhrase(targetCorpusString, targetVocab);
      Assert.assertEquals(targetCorpusSentence.size(), 18);
     
      int[] targetCorpus = new int[targetCorpusSentence.size()];
      for(int i = 0; i < targetCorpusSentence.size(); i++) {
        targetCorpus[i] = targetCorpusSentence.getWordID(i);
      }
     

     

     
      CorpusArray targetCorpusArray = new CorpusArray(targetCorpus, targetSentenceStartPositions, targetVocab);
      targetSuffixArray = new SuffixArray(targetCorpusArray);

     
      int[] lowestAlignedTargetIndex = new int[corpusSentence.size()];
      int[] highestAlignedTargetIndex = new int[corpusSentence.size()];
      int[] lowestAlignedSourceIndex = new int[targetCorpusSentence.size()];
      int[] highestAlignedSourceIndex = new int[targetCorpusSentence.size()];
     
      int[][] alignedTargetIndices = new int[corpusSentence.size()][];
      int[][] alignedSourceIndices = new int[targetCorpusSentence.size()][];
     
     
     
      {
        for (int i=0; i<18; i++) {
View Full Code Here

    int sampleSize = 300;
    int minNonterminalSpan = 2;
//    RuleExtractor ruleExtractor = new HierarchicalRuleExtractor(suffixArray, targetCorpusArray, alignments, lexProbs, sampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan);
   
   
    BasicPhrase query = new BasicPhrase("it makes him", sourceVocab);
    //BasicPhrase query = new BasicPhrase("it makes him and it mars him", sourceVocab);
    //BasicPhrase query = new BasicPhrase("it makes him and it mars him , it sets him on and it takes him off .", sourceVocab);
    ParallelCorpusGrammarFactory parallelCorpus = new ParallelCorpusGrammarFactory(suffixArray, targetSuffixArray, alignments, null, sampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost);
//    simplePrefixTree = new PrefixTree(suffixArray, targetCorpusArray, alignments, suffixArray.getVocabulary(), lexProbs, ruleExtractor, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan);
    simplePrefixTree = new PrefixTree(parallelCorpus);
    simplePrefixTree.add(query.getWordIDs());
   
    Assert.assertNotNull(simplePrefixTree.root);
    Assert.assertNotNull(simplePrefixTree.root.children);
   
    /////////////////////////////
View Full Code Here

//    RuleExtractor ruleExtractor = new HierarchicalRuleExtractor(suffixArray, targetCorpusArray, alignments, lexProbs, sampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan);
   
   
    //BasicPhrase query = new BasicPhrase("it makes him", sourceVocab);
    //BasicPhrase query = new BasicPhrase("it makes him and it mars him", sourceVocab);
    BasicPhrase query = new BasicPhrase("it makes him and it mars him , it sets him on and it takes him off .", sourceVocab);
    ParallelCorpusGrammarFactory parallelCorpus = new ParallelCorpusGrammarFactory(suffixArray, targetSuffixArray, alignments, null, sampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost);
//    PrefixTree prefixTree = new PrefixTree(suffixArray, targetCorpusArray, alignments, suffixArray.getVocabulary(), lexProbs, ruleExtractor, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan);
    PrefixTree prefixTree = new PrefixTree(parallelCorpus);
    prefixTree.add(query.getWordIDs());
     
    Assert.assertNotNull(prefixTree.root);
    Assert.assertNotNull(prefixTree.root.children);
   
    /////////////////////////////
View Full Code Here

//    RuleExtractor ruleExtractor = new HierarchicalRuleExtractor(suffixArray, targetCorpusArray, alignments, lexProbs, sampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan);
   
   
    String queryString = "it persuades him and it disheartens him";
   
    BasicPhrase querySentence = new BasicPhrase(queryString, sourceVocab);
   
    Assert.assertEquals(querySentence.toString(), "it UNK him and it UNK him");
    Assert.assertEquals(corpusSentence.toString(), corpusString);
    ParallelCorpusGrammarFactory parallelCorpus = new ParallelCorpusGrammarFactory(suffixArray, targetSuffixArray, alignments, null, sampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost);

//    PrefixTree prefixTree = new PrefixTree(suffixArray, targetCorpusArray, alignments, suffixArray.getVocabulary(), lexProbs, ruleExtractor, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan);
    PrefixTree prefixTree = new PrefixTree(parallelCorpus);
    prefixTree.add(querySentence.getWordIDs());

   
   
    Assert.assertTrue(prefixTree.root.children.containsKey(PrefixTree.X));
    Assert.assertTrue(prefixTree.root.children.containsKey(sourceVocab.getID("it")));
View Full Code Here

   
    // Adam Lopez's example...
    String corpusString = "it makes him and it mars him , it sets him on and it takes him off .";

    vocab = new Vocabulary();
    Phrase exampleSentence = new BasicPhrase(corpusString, vocab);
   
    exampleSentence = new BasicPhrase(corpusString, vocab);
    int[] sentences = new int[1];
    sentences[0] = 0;
    int[] corpus = new int[exampleSentence.size()];
    for(int i = 0; i < exampleSentence.size(); i++) {
      corpus[i] = exampleSentence.getWordID(i);
    }
   
    CorpusArray corpusArray = new CorpusArray(corpus, sentences, vocab);
   
    if (binaryFileName==null || binaryFileName.trim().length()==0)
View Full Code Here

  @Test
  public void findPhrase() {
   
    // Look up phrase "it makes him"
   
    Phrase phrase = new BasicPhrase("it makes him", vocab);
    int[] bounds = suffixArray.findPhrase(phrase);
   
    int expectedSuffixArrayStartIndex = 0;
    int expectedSuffixArrayEndIndex = 0;
   
    Assert.assertEquals(bounds.length, 2);
    Assert.assertEquals(bounds[0], expectedSuffixArrayStartIndex);
    Assert.assertEquals(bounds[1], expectedSuffixArrayEndIndex);
   
   
    // Look up phrase "and it"
   
    phrase = new BasicPhrase("and it", vocab);
    bounds = suffixArray.findPhrase(phrase);
   
    expectedSuffixArrayStartIndex = 9;
    expectedSuffixArrayEndIndex = 10;
   
View Full Code Here

TOP

Related Classes of joshua.corpus.suffix_array.BasicPhrase

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.