Examples of joshua.corpus.suffix_array.ParallelCorpusGrammarFactory

joshua.corpus.suffix_array.ParallelCorpusGrammarFactory
Aligned parallel corpus, capable of extracting a sentence-specific translation grammar.
The source side of the aligned parallel corpus is backed by a suffix array.

Author: Lane Schwartz

      frequentPhrases.cacheInvertedIndices();
    }




    logger.info("Constructing grammar factory from parallel corpus");
    ParallelCorpusGrammarFactory parallelCorpus;
    if (binaryCorpus) {
      if (logger.isLoggable(Level.INFO)) logger.info("Constructing lexical translation probabilities from binary file " + binaryLexCountsFilename);
      parallelCorpus = new ParallelCorpusGrammarFactory(sourceSuffixArray, targetSuffixArray, alignments, null, binaryLexCountsFilename, ruleSampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost);
    } else { 
      if (logger.isLoggable(Level.INFO)) logger.info("Constructing lexical translation probabilities from parallel corpus"); 
      parallelCorpus = new ParallelCorpusGrammarFactory(sourceSuffixArray, targetSuffixArray, alignments, null, ruleSampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost);
    }
    return parallelCorpus;
  }

View Full Code Here

    } else {
      out = new PrintStream(outputFile,"UTF-8");
      logger.info("Rules will be written to " + outputFile);
    }
    
    ParallelCorpusGrammarFactory parallelCorpus = this.getGrammarFactory();
    
    logger.info("Getting symbol table");
    SymbolTable sourceVocab = parallelCorpus.getSourceCorpus().getVocabulary();
    
    int lineNumber = 0;
    boolean oneTreePerSentence = ! this.keepTree;
    
    logger.info("Will read test sentences from " + testFileName);
    Scanner testFileScanner = new Scanner(new File(testFileName), encoding);
    
    logger.info("Read test sentences from " + testFileName);
    PrefixTree prefixTree = null;
    while (testFileScanner.hasNextLine() && (lineNumber-startingSentence+1)<maxTestSentences) {


      String line = testFileScanner.nextLine();
      lineNumber++;
      if (lineNumber < startingSentence) continue;
      
      int[] words = sourceVocab.getIDs(line);
      
      if (oneTreePerSentence || null==prefixTree) 
      {
//        prefixTree = new PrefixTree(sourceSuffixArray, targetCorpusArray, alignments, sourceSuffixArray.getVocabulary(), lexProbs, ruleExtractor, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan);
        if (logger.isLoggable(Level.INFO)) logger.info("Constructing new prefix tree");
        Node.resetNodeCounter();
        prefixTree = new PrefixTree(parallelCorpus);
        prefixTree.setPrintStream(out);
        prefixTree.sentenceInitialX = this.sentenceInitialX;
        prefixTree.sentenceFinalX   = this.sentenceFinalX;
        prefixTree.edgeXMayViolatePhraseSpan = this.edgeXViolates;
      }
      try {
        if (logger.isLoggable(Level.INFO)) logger.info("Processing source line " + lineNumber + ": " + line);
        prefixTree.add(words);
      } catch (OutOfMemoryError e) {
        logger.warning("Out of memory - attempting to clear cache to free space");
        parallelCorpus.getSuffixArray().getCachedHierarchicalPhrases().clear();
//        targetSuffixArray.getCachedHierarchicalPhrases().clear();
        prefixTree = null;
        System.gc();
        logger.info("Cleared cache and collected garbage. Now attempting to re-construct prefix tree...");
//        prefixTree = new PrefixTree(sourceSuffixArray, targetCorpusArray, alignments, sourceSuffixArray.getVocabulary(), lexProbs, ruleExtractor, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan);

View Full Code Here

        if (JoshuaConfiguration.tm_file.endsWith(".josh")) {
          
          try {
            // Use corpus-based grammar
            //inside getParallelCorpus, we will initialize symboltable, lm, and feature functions
            ParallelCorpusGrammarFactory parallelCorpus = getParallelCorpus(configFile);              
            grammarFactories.add(parallelCorpus);
            
          } catch (Exception e) {
            IOException ioe = new IOException("Error reading suffix array grammar.");
            ioe.initCause(e);

View Full Code Here

          binaryAlignmentFileName,
          sourceCorpusArray,
          targetCorpusArray);
        
    // Finally, add the parallel corpus that will serve as a grammar
    ParallelCorpusGrammarFactory parallelCorpus = new ParallelCorpusGrammarFactory(
        sourceSuffixArray,
        targetSuffixArray,
        alignments,
        this.featureFunctions,
        JoshuaConfiguration.sa_rule_sample_size,

View Full Code Here

    CorpusArray targetCorpus = new CorpusArray(sentenceF, sentenceStartPositions, vocab);
    SuffixArray targetSuffixes = new SuffixArray(targetCorpus);


    CorpusArray sourceCorpus = new CorpusArray(sentence, sentenceStartPositions, vocab);
    SuffixArray sourceSuffixes = new SuffixArray(sourceCorpus);
    ParallelCorpusGrammarFactory parallelCorpus = new ParallelCorpusGrammarFactory(sourceSuffixes, targetSuffixes, alignments, null, Integer.MAX_VALUE, maxPhraseSpan, maxPhraseLength, maxNonterminals, 2, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost);
    
//    tree = new PrefixTree(vocab, maxPhraseSpan, maxPhraseLength, maxNonterminals);
    tree = new PrefixTree(parallelCorpus);
    Assert.assertNotNull(tree);

View Full Code Here

    
    for (int i=0; i<numIterations; i++) {
      logger.info("Extracting rules for sentence " + (i+1) + ".");
      long startTime1 = System.currentTimeMillis();
      {
        ParallelCorpusGrammarFactory parallelCorpus = new ParallelCorpusGrammarFactory(sourceSuffixArray, targetSuffixArray, alignments, null, ruleSampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost);


//        PrefixTree prefixTree = new PrefixTree(sourceSuffixArray, targetCorpusArray, alignments, sourceSuffixArray.getVocabulary(), lexProbs, ruleExtractor, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan);
        PrefixTree prefixTree = new PrefixTree(parallelCorpus);
        
        prefixTree.sentenceInitialX = true;

View Full Code Here

   * @param maxPhraseSpan
   * @param maxPhraseLength
   * @param maxNonterminals
   */
  PrefixTree(SymbolTable vocab, int maxPhraseSpan, int maxPhraseLength, int maxNonterminals) {
    this(new ParallelCorpusGrammarFactory((Suffixes) null, (Suffixes) null, (Alignments) null, null, Integer.MAX_VALUE, maxPhraseSpan, maxPhraseLength, maxNonterminals, 2, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost));
  }

View Full Code Here

    
    
    BasicPhrase query = new BasicPhrase("it makes him", sourceVocab);
    //BasicPhrase query = new BasicPhrase("it makes him and it mars him", sourceVocab);
    //BasicPhrase query = new BasicPhrase("it makes him and it mars him , it sets him on and it takes him off .", sourceVocab);
    ParallelCorpusGrammarFactory parallelCorpus = new ParallelCorpusGrammarFactory(suffixArray, targetSuffixArray, alignments, null, sampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost);
//    simplePrefixTree = new PrefixTree(suffixArray, targetCorpusArray, alignments, suffixArray.getVocabulary(), lexProbs, ruleExtractor, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan);
    simplePrefixTree = new PrefixTree(parallelCorpus);
    simplePrefixTree.add(query.getWordIDs());
    
    Assert.assertNotNull(simplePrefixTree.root);

View Full Code Here

    
    
    //BasicPhrase query = new BasicPhrase("it makes him", sourceVocab);
    //BasicPhrase query = new BasicPhrase("it makes him and it mars him", sourceVocab);
    BasicPhrase query = new BasicPhrase("it makes him and it mars him , it sets him on and it takes him off .", sourceVocab);
    ParallelCorpusGrammarFactory parallelCorpus = new ParallelCorpusGrammarFactory(suffixArray, targetSuffixArray, alignments, null, sampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost);
//    PrefixTree prefixTree = new PrefixTree(suffixArray, targetCorpusArray, alignments, suffixArray.getVocabulary(), lexProbs, ruleExtractor, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan);
    PrefixTree prefixTree = new PrefixTree(parallelCorpus);
    prefixTree.add(query.getWordIDs());
      
    Assert.assertNotNull(prefixTree.root);

View Full Code Here

    
    BasicPhrase querySentence = new BasicPhrase(queryString, sourceVocab);
    
    Assert.assertEquals(querySentence.toString(), "it UNK him and it UNK him");
    Assert.assertEquals(corpusSentence.toString(), corpusString);
    ParallelCorpusGrammarFactory parallelCorpus = new ParallelCorpusGrammarFactory(suffixArray, targetSuffixArray, alignments, null, sampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost);


//    PrefixTree prefixTree = new PrefixTree(suffixArray, targetCorpusArray, alignments, suffixArray.getVocabulary(), lexProbs, ruleExtractor, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan);
    PrefixTree prefixTree = new PrefixTree(parallelCorpus);
    prefixTree.add(querySentence.getWordIDs());

View Full Code Here

TOP

Related Classes of joshua.corpus.suffix_array.ParallelCorpusGrammarFactory

joshua.corpus.lexprob.LexProbs

joshua.decoder.JoshuaDecoder

joshua.prefix_tree.ExtractRuleProfiler

joshua.prefix_tree.ExtractRules

joshua.prefix_tree.HierarchicalRuleExtractor

joshua.prefix_tree.PrefixTree

joshua.prefix_tree.PrefixTreeAdvancedTest

joshua.prefix_tree.PrefixTreeTest

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by Oracle. Contact coftware#gmail.com.