Package joshua.corpus.suffix_array

Examples of joshua.corpus.suffix_array.ParallelCorpusGrammarFactory


      frequentPhrases.cacheInvertedIndices();
    }


    logger.info("Constructing grammar factory from parallel corpus");
    ParallelCorpusGrammarFactory parallelCorpus;
    if (binaryCorpus) {
      if (logger.isLoggable(Level.INFO)) logger.info("Constructing lexical translation probabilities from binary file " + binaryLexCountsFilename);
      parallelCorpus = new ParallelCorpusGrammarFactory(sourceSuffixArray, targetSuffixArray, alignments, null, binaryLexCountsFilename, ruleSampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost);
    } else {
      if (logger.isLoggable(Level.INFO)) logger.info("Constructing lexical translation probabilities from parallel corpus");
      parallelCorpus = new ParallelCorpusGrammarFactory(sourceSuffixArray, targetSuffixArray, alignments, null, ruleSampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost);
    }
    return parallelCorpus;
  }
View Full Code Here


    } else {
      out = new PrintStream(outputFile,"UTF-8");
      logger.info("Rules will be written to " + outputFile);
    }
   
    ParallelCorpusGrammarFactory parallelCorpus = this.getGrammarFactory();
   
    logger.info("Getting symbol table");
    SymbolTable sourceVocab = parallelCorpus.getSourceCorpus().getVocabulary();
   
    int lineNumber = 0;
    boolean oneTreePerSentence = ! this.keepTree;
   
    logger.info("Will read test sentences from " + testFileName);
    Scanner testFileScanner = new Scanner(new File(testFileName), encoding);
   
    logger.info("Read test sentences from " + testFileName);
    PrefixTree prefixTree = null;
    while (testFileScanner.hasNextLine() && (lineNumber-startingSentence+1)<maxTestSentences) {

      String line = testFileScanner.nextLine();
      lineNumber++;
      if (lineNumber < startingSentence) continue;
     
      int[] words = sourceVocab.getIDs(line);
     
      if (oneTreePerSentence || null==prefixTree)
      {
//        prefixTree = new PrefixTree(sourceSuffixArray, targetCorpusArray, alignments, sourceSuffixArray.getVocabulary(), lexProbs, ruleExtractor, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan);
        if (logger.isLoggable(Level.INFO)) logger.info("Constructing new prefix tree");
        Node.resetNodeCounter();
        prefixTree = new PrefixTree(parallelCorpus);
        prefixTree.setPrintStream(out);
        prefixTree.sentenceInitialX = this.sentenceInitialX;
        prefixTree.sentenceFinalX   = this.sentenceFinalX;
        prefixTree.edgeXMayViolatePhraseSpan = this.edgeXViolates;
      }
      try {
        if (logger.isLoggable(Level.INFO)) logger.info("Processing source line " + lineNumber + ": " + line);
        prefixTree.add(words);
      } catch (OutOfMemoryError e) {
        logger.warning("Out of memory - attempting to clear cache to free space");
        parallelCorpus.getSuffixArray().getCachedHierarchicalPhrases().clear();
//        targetSuffixArray.getCachedHierarchicalPhrases().clear();
        prefixTree = null;
        System.gc();
        logger.info("Cleared cache and collected garbage. Now attempting to re-construct prefix tree...");
//        prefixTree = new PrefixTree(sourceSuffixArray, targetCorpusArray, alignments, sourceSuffixArray.getVocabulary(), lexProbs, ruleExtractor, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan);
View Full Code Here

        if (JoshuaConfiguration.tm_file.endsWith(".josh")) {
         
          try {
            // Use corpus-based grammar
            //inside getParallelCorpus, we will initialize symboltable, lm, and feature functions
            ParallelCorpusGrammarFactory parallelCorpus = getParallelCorpus(configFile);             
            grammarFactories.add(parallelCorpus);
           
          } catch (Exception e) {
            IOException ioe = new IOException("Error reading suffix array grammar.");
            ioe.initCause(e);
View Full Code Here

          binaryAlignmentFileName,
          sourceCorpusArray,
          targetCorpusArray);
       
    // Finally, add the parallel corpus that will serve as a grammar
    ParallelCorpusGrammarFactory parallelCorpus = new ParallelCorpusGrammarFactory(
        sourceSuffixArray,
        targetSuffixArray,
        alignments,
        this.featureFunctions,
        JoshuaConfiguration.sa_rule_sample_size,
View Full Code Here

    CorpusArray targetCorpus = new CorpusArray(sentenceF, sentenceStartPositions, vocab);
    SuffixArray targetSuffixes = new SuffixArray(targetCorpus);

    CorpusArray sourceCorpus = new CorpusArray(sentence, sentenceStartPositions, vocab);
    SuffixArray sourceSuffixes = new SuffixArray(sourceCorpus);
    ParallelCorpusGrammarFactory parallelCorpus = new ParallelCorpusGrammarFactory(sourceSuffixes, targetSuffixes, alignments, null, Integer.MAX_VALUE, maxPhraseSpan, maxPhraseLength, maxNonterminals, 2, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost);
   
//    tree = new PrefixTree(vocab, maxPhraseSpan, maxPhraseLength, maxNonterminals);
    tree = new PrefixTree(parallelCorpus);
    Assert.assertNotNull(tree);
   
View Full Code Here

   
    for (int i=0; i<numIterations; i++) {
      logger.info("Extracting rules for sentence " + (i+1) + ".");
      long startTime1 = System.currentTimeMillis();
      {
        ParallelCorpusGrammarFactory parallelCorpus = new ParallelCorpusGrammarFactory(sourceSuffixArray, targetSuffixArray, alignments, null, ruleSampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost);

//        PrefixTree prefixTree = new PrefixTree(sourceSuffixArray, targetCorpusArray, alignments, sourceSuffixArray.getVocabulary(), lexProbs, ruleExtractor, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan);
        PrefixTree prefixTree = new PrefixTree(parallelCorpus);
       
        prefixTree.sentenceInitialX = true;
View Full Code Here

   * @param maxPhraseSpan
   * @param maxPhraseLength
   * @param maxNonterminals
   */
  PrefixTree(SymbolTable vocab, int maxPhraseSpan, int maxPhraseLength, int maxNonterminals) {
    this(new ParallelCorpusGrammarFactory((Suffixes) null, (Suffixes) null, (Alignments) null, null, Integer.MAX_VALUE, maxPhraseSpan, maxPhraseLength, maxNonterminals, 2, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost));
  }
View Full Code Here

   
   
    BasicPhrase query = new BasicPhrase("it makes him", sourceVocab);
    //BasicPhrase query = new BasicPhrase("it makes him and it mars him", sourceVocab);
    //BasicPhrase query = new BasicPhrase("it makes him and it mars him , it sets him on and it takes him off .", sourceVocab);
    ParallelCorpusGrammarFactory parallelCorpus = new ParallelCorpusGrammarFactory(suffixArray, targetSuffixArray, alignments, null, sampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost);
//    simplePrefixTree = new PrefixTree(suffixArray, targetCorpusArray, alignments, suffixArray.getVocabulary(), lexProbs, ruleExtractor, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan);
    simplePrefixTree = new PrefixTree(parallelCorpus);
    simplePrefixTree.add(query.getWordIDs());
   
    Assert.assertNotNull(simplePrefixTree.root);
View Full Code Here

   
   
    //BasicPhrase query = new BasicPhrase("it makes him", sourceVocab);
    //BasicPhrase query = new BasicPhrase("it makes him and it mars him", sourceVocab);
    BasicPhrase query = new BasicPhrase("it makes him and it mars him , it sets him on and it takes him off .", sourceVocab);
    ParallelCorpusGrammarFactory parallelCorpus = new ParallelCorpusGrammarFactory(suffixArray, targetSuffixArray, alignments, null, sampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost);
//    PrefixTree prefixTree = new PrefixTree(suffixArray, targetCorpusArray, alignments, suffixArray.getVocabulary(), lexProbs, ruleExtractor, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan);
    PrefixTree prefixTree = new PrefixTree(parallelCorpus);
    prefixTree.add(query.getWordIDs());
     
    Assert.assertNotNull(prefixTree.root);
View Full Code Here

   
    BasicPhrase querySentence = new BasicPhrase(queryString, sourceVocab);
   
    Assert.assertEquals(querySentence.toString(), "it UNK him and it UNK him");
    Assert.assertEquals(corpusSentence.toString(), corpusString);
    ParallelCorpusGrammarFactory parallelCorpus = new ParallelCorpusGrammarFactory(suffixArray, targetSuffixArray, alignments, null, sampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost);

//    PrefixTree prefixTree = new PrefixTree(suffixArray, targetCorpusArray, alignments, suffixArray.getVocabulary(), lexProbs, ruleExtractor, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan);
    PrefixTree prefixTree = new PrefixTree(parallelCorpus);
    prefixTree.add(querySentence.getWordIDs());
View Full Code Here

TOP

Related Classes of joshua.corpus.suffix_array.ParallelCorpusGrammarFactory

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.