Examples of joshua.corpus.Corpus

joshua.corpus.Corpus
Corpus is an interface that contains methods for accessing the information within a monolingual corpus. @author Chris Callison-Burch @since 7 February 2005 @version $LastChangedDate:2008-07-30 17:15:52 -0400 (Wed, 30 Jul 2008) $

      // Test fixture setup: build one shared vocabulary, then a corpus array for the
      // source side and one for the target side, and start writing the alignments file.

      // A single symbol table is filled from both the source and the target corpus files.
      Vocabulary symbolTable = new Vocabulary();
      // initializeVocabulary returns a two-element array (asserted below); element 0 and
      // element 1 are handed to createCorpusArray as its third and fourth arguments.
      int[] sourceLengths = Vocabulary.initializeVocabulary(sourceCorpusFileName, symbolTable, true);
      Assert.assertEquals(sourceLengths.length, 2);
      // Element 1 is used as the sentence count.
      int numberOfSentences = sourceLengths[1];
      
      Corpus sourceCorpus = SuffixArrayFactory.createCorpusArray(sourceCorpusFileName, symbolTable, sourceLengths[0], sourceLengths[1]);
    
      
      // Set up target corpus
      // NOTE(review): the temp-file suffix is Date.toString(), which contains spaces and
      // colons -- confirm this is acceptable on every platform the test runs on.
      File targetFile = File.createTempFile("target", new Date().toString());
      // Write the target sentences as UTF-8, one sentence per line.
      PrintStream targetPrintStream = new PrintStream(targetFile, "UTF-8");
      for (String sentence : targetSentences) {
        targetPrintStream.println(sentence);
      }
      targetPrintStream.close();
      String targetCorpusFileName = targetFile.getAbsolutePath();
      
      // Add the target file's words to the same symbol table used for the source side.
      int[] targetLengths = Vocabulary.initializeVocabulary(targetCorpusFileName, symbolTable, true);
      // The target counts must match the source counts element by element.
      Assert.assertEquals(targetLengths.length, sourceLengths.length);
      for (int i=0, n=targetLengths.length; i<n; i++) {
        Assert.assertEquals(targetLengths[i], sourceLengths[i]);
      }
      
      Corpus targetCorpus = SuffixArrayFactory.createCorpusArray(targetCorpusFileName, symbolTable, targetLengths[0], targetLengths[1]);
      
      
      // Construct alignments data structure
      // NOTE(review): same Date.toString() suffix as the target temp file above.
      File alignmentsFile = File.createTempFile("alignments", new Date().toString());
      PrintStream alignmentsPrintStream = new PrintStream(alignmentsFile, "UTF-8");

View Full Code Here

    
    
    // Builds the corpus array and suffix array for the source side and then for the
    // target side. For each of the four structures the binaryCorpus flag selects
    // between a memory-mapped implementation and one created by SuffixArrayFactory.
    // Afterwards the two corpora are checked to hold the same number of sentences.

    //////////////////////////////////
    // Source language corpus array //
    //////////////////////////////////
    final Corpus sourceCorpusArray;
    if (binaryCorpus) {
      if (logger.isLoggable(Level.INFO)) logger.info("Constructing memory mapped source language corpus array.");
      sourceCorpusArray = new MemoryMappedCorpusArray(commonVocab, sourceFileName);
    } else {
      if (logger.isLoggable(Level.INFO)) logger.info("Constructing source language corpus array.");
      sourceCorpusArray = SuffixArrayFactory.createCorpusArray(sourceFileName, commonVocab, numSourceWords, numSourceSentences);
    }


    //////////////////////////////////
    // Source language suffix array //
    //////////////////////////////////
    Suffixes sourceSuffixArray;
    // Local alias for sourceSuffixesFileName; it is only read in the binary branch below.
    String binarySourceSuffixArrayFileName = sourceSuffixesFileName;
    if (binaryCorpus) {
      if (logger.isLoggable(Level.INFO)) logger.info("Constructing source language suffix array from binary file " + binarySourceSuffixArrayFileName);
      sourceSuffixArray = new MemoryMappedSuffixArray(binarySourceSuffixArrayFileName, sourceCorpusArray, cacheSize);
    } else {
      if (logger.isLoggable(Level.INFO)) logger.info("Constructing source language suffix array from source corpus.");
      sourceSuffixArray = SuffixArrayFactory.createSuffixArray(sourceCorpusArray, cacheSize);
    }
    
    


        
    //////////////////////////////////
    // Target language corpus array //
    //////////////////////////////////
    final Corpus targetCorpusArray;
    if (binaryCorpus) {
      if (logger.isLoggable(Level.INFO)) logger.info("Constructing memory mapped target language corpus array.");
      targetCorpusArray = new MemoryMappedCorpusArray(commonVocab, targetFileName);
    } else {
      if (logger.isLoggable(Level.INFO)) logger.info("Constructing target language corpus array.");
      targetCorpusArray = SuffixArrayFactory.createCorpusArray(targetFileName, commonVocab, numTargetWords, numTargetSentences);
    }
    


    //////////////////////////////////
    // Target language suffix array //
    //////////////////////////////////
    Suffixes targetSuffixArray;
    // Local alias for targetSuffixesFileName; it is only read in the binary branch below.
    String binaryTargetSuffixArrayFileName = targetSuffixesFileName;
    if (binaryCorpus) {
      if (logger.isLoggable(Level.INFO)) logger.info("Constructing target language suffix array from binary file " + binaryTargetSuffixArrayFileName);
      targetSuffixArray = new MemoryMappedSuffixArray(binaryTargetSuffixArrayFileName, targetCorpusArray, cacheSize);
    } else {
      if (logger.isLoggable(Level.INFO)) logger.info("Constructing target language suffix array from target corpus.");
      targetSuffixArray = SuffixArrayFactory.createSuffixArray(targetCorpusArray, cacheSize);
    }


    // The source sentence count is taken as the training size; a target corpus with a
    // different sentence count is rejected with a RuntimeException.
    int trainingSize = sourceCorpusArray.getNumSentences();
    if (trainingSize != targetCorpusArray.getNumSentences()) {
      throw new RuntimeException("Source and target corpora have different number of sentences. This is bad.");
    }
    
    
    /////////////////////

View Full Code Here

        maxPhraseSpan, maxPhraseLength, 
        minNonterminalSpan, maxNonterminalSpan);
    
    SymbolTable vocab = new Vocabulary();
    
    Corpus corpus = suffixArray.getCorpus();
    
    NGramLanguageModel largeLM = new LMGrammarJAVA(
        vocab,
        lmOrder,
        largeArpaLM,
        JoshuaConfiguration.use_left_equivalent_state,
        JoshuaConfiguration.use_right_equivalent_state);
    
    NGramLanguageModel testLM = new LMGrammarJAVA(
        vocab,
        lmOrder,
        testArpaLM,
        JoshuaConfiguration.use_left_equivalent_state,
        JoshuaConfiguration.use_right_equivalent_state);
    
    this.weights = new float[corpus.getNumSentences()];
    
    for (int i=0, n=corpus.getNumSentences(); i<n; i++) {
      Phrase sentence = corpus.getSentence(i);
      int[] words = sentence.getWordIDs();
      double largeProbLM = largeLM.ngramLogProbability(words);
      double testProbLM = testLM.ngramLogProbability(words);
      double ratio = testProbLM - largeProbLM;
      this.weights[i] = (float) ratio;

View Full Code Here

    if (logger.isLoggable(Level.FINE)) {
      logger.fine("Counting word co-occurrence from parallel corpus. Using floor probability " + floorProbability);
    }
    
    Alignments alignments = parallelCorpus.getAlignments();
    Corpus sourceCorpus = parallelCorpus.getSourceCorpus();
    Corpus targetCorpus = parallelCorpus.getTargetCorpus();
    int numSentences = parallelCorpus.getNumSentences();
    
    Counts<Integer,Integer> counts = new Counts<Integer,Integer>(floorProbability);
    
    // Iterate over each sentence
    for (int sentenceID=0; sentenceID<numSentences; sentenceID++) {


      int sourceStart = sourceCorpus.getSentencePosition(sentenceID);
      int sourceEnd = sourceCorpus.getSentenceEndPosition(sentenceID);


      int targetStart = targetCorpus.getSentencePosition(sentenceID);
      int targetEnd = targetCorpus.getSentenceEndPosition(sentenceID);


      // Iterate over each word in the source sentence
      for (int sourceIndex=sourceStart; sourceIndex<sourceEnd; sourceIndex++) {


        // Get the token for the current source word
        int sourceWord = sourceCorpus.getWordID(sourceIndex);
        
        // Get the target indices aligned to this source word
        int[] targetPoints = alignments.getAlignedTargetIndices(sourceIndex);
        
        // If the source word is unaligned,
        // then we treat it as being aligned to a special NULL token;
        // we use Java's null to represent the NULL token
        if (targetPoints==null) {
          
          counts.incrementCount(sourceWord, null);
          
        } else {
          
          // If the source word is aligned,
          // then we must iterate over each aligned target point
          for (int targetPoint : targetPoints) {


            int targetWord = targetCorpus.getWordID(targetPoint);


            counts.incrementCount(sourceWord, targetWord);
          }
        }
        
      }
      
      // Iterate over each word in the target sentence
      for (int targetIndex=targetStart; targetIndex<targetEnd; targetIndex++) {


        // Get the token for the current source word
        int targetWord = targetCorpus.getWordID(targetIndex);
        
        // Get the source indices aligned to this target word
        int[] sourcePoints = alignments.getAlignedSourceIndices(targetIndex);
        
        // If the source word is unaligned,

View Full Code Here

  /* See Javadoc for LexicalProbabilities#lexProbSourceGivenTarget(MatchedHierarchicalPhrases,int,HierarchicalPhrase). */
  public float lexProbSourceGivenTarget(MatchedHierarchicalPhrases sourcePhrases, int sourcePhraseIndex, HierarchicalPhrase targetPhrase) {
    
    float sourceGivenTarget = 1.0f;
    
    Corpus sourceCorpus = parallelCorpus.getSourceCorpus();
    Corpus targetCorpus = parallelCorpus.getTargetCorpus();
    Alignments alignments = parallelCorpus.getAlignments();
    
    // Iterate over each terminal sequence in the source phrase
    for (int seq=0; seq<sourcePhrases.getNumberOfTerminalSequences(); seq++) {
      
      // Iterate over each source index in the current terminal sequence
      for (int sourceWordIndex=sourcePhrases.getTerminalSequenceStartIndex(sourcePhraseIndex, seq),
            end=sourcePhrases.getTerminalSequenceEndIndex(sourcePhraseIndex, seq);
          sourceWordIndex<end; 
          sourceWordIndex++) {
        
                
        int sourceWord = sourceCorpus.getWordID(sourceWordIndex);
        int[] targetIndices = alignments.getAlignedTargetIndices(sourceWordIndex);
        
        float sum = 0.0f;
        float average;
        
        if (targetIndices==null) {
          
          sum += this.sourceGivenTarget(sourceWord, null);
          average = sum;
          
        } else {
          for (int targetIndex : targetIndices) {


            int targetWord = targetCorpus.getWordID(targetIndex);
            sum += sourceGivenTarget(sourceWord, targetWord);
            
          }
          average = sum / targetIndices.length;
        }

View Full Code Here

  /* See Javadoc for LexicalProbabilities#lexProbTargetGivenSource(MatchedHierarchicalPhrases,int,HierarchicalPhrase). */
  public float lexProbTargetGivenSource(MatchedHierarchicalPhrases sourcePhrases, int sourcePhraseIndex, HierarchicalPhrase targetPhrase) {
    
    final boolean LOGGING_FINEST = logger.isLoggable(Level.FINEST);
    
    Corpus sourceCorpus = parallelCorpus.getSourceCorpus();
    Corpus targetCorpus = parallelCorpus.getTargetCorpus();
    Alignments alignments = parallelCorpus.getAlignments();
    
    StringBuilder s;
    if (LOGGING_FINEST) {
      s = new StringBuilder();
      s.append("lexProb( ");
      s.append(sourcePhrases.getPattern().toString());
      s.append(" | ");
      s.append(targetPhrase.toString());
      s.append(" )  =  1.0");
    } else {
      s = null;
    }
    
    float targetGivenSource = 1.0f;


    // Iterate over each terminal sequence in the target phrase
    for (int seq=0; seq<targetPhrase.getNumberOfTerminalSequences(); seq++) {
      
      // Iterate over each source index in the current terminal sequence
      for (int targetWordIndex=targetPhrase.getTerminalSequenceStartIndex(seq),
            end=targetPhrase.getTerminalSequenceEndIndex(seq);
          targetWordIndex<end; 
          targetWordIndex++) {
        
        int targetWord = targetCorpus.getWordID(targetWordIndex);
        int[] sourceIndices = alignments.getAlignedSourceIndices(targetWordIndex);
        
        float sum = 0.0f;
        float average;

View Full Code Here

      // Assemble an AlignedParallelCorpus from four files under joshDir:
      // common.vocab, source.corpus, target.corpus and alignment.grids.

      // Read the shared vocabulary into commonVocab.
      // NOTE(review): 'in' is not closed anywhere in the visible code -- confirm whether
      // it is closed elsewhere or whether a close() is missing.
      String binaryVocabFileName = joshDir + "/common.vocab";
      ObjectInput in = BinaryIn.vocabulary(binaryVocabFileName);
    commonVocab.readExternal(in);
    
    // Both corpus arrays are memory-mapped and use the same commonVocab.
    String sourceFileName = joshDir + "/source.corpus";
    Corpus sourceCorpusArray = new MemoryMappedCorpusArray(commonVocab, sourceFileName);


    String targetFileName = joshDir + "/target.corpus";
    Corpus targetCorpusArray = new MemoryMappedCorpusArray(commonVocab, targetFileName);
  
    // The alignment grids are constructed over the two corpus arrays built above.
    String alignmentFileName = joshDir + "/alignment.grids";
    Alignments alignments = new MemoryMappedAlignmentGrids(alignmentFileName, sourceCorpusArray, targetCorpusArray);
  
    return new AlignedParallelCorpus(sourceCorpusArray, targetCorpusArray, alignments);

View Full Code Here

    this.initializeStateComputers(symbolTable, JoshuaConfiguration.lmOrder, JoshuaConfiguration.ngramStateID);
    
    if (logger.isLoggable(Level.INFO))
      logger.info("Reading source language corpus from " +
        binarySourceCorpusFileName);
    Corpus sourceCorpusArray =
      new MemoryMappedCorpusArray(
        this.symbolTable, binarySourceCorpusFileName);
    
    
    if (logger.isLoggable(Level.INFO))
      logger.info("Reading source language suffix array from " +
        binarySourceSuffixesFileName);
    Suffixes sourceSuffixArray =
      new MemoryMappedSuffixArray(
          binarySourceSuffixesFileName,
          sourceCorpusArray,
          maxCacheSize);


    
    if (logger.isLoggable(Level.INFO))
      logger.info("Reading target language corpus from " +
        binaryTargetCorpusFileName);
    Corpus targetCorpusArray =
      new MemoryMappedCorpusArray(
        this.symbolTable, binaryTargetCorpusFileName);
    
    if (logger.isLoggable(Level.INFO))
      logger.info("Reading target language suffix array from " +

View Full Code Here

    // Build corpus arrays and suffix arrays for both sides from their corpus files,
    // then read the alignment grids. Unlike a shared-vocabulary setup, the source and
    // target sides each get their own Vocabulary instance here.

    Vocabulary sourceVocab = new Vocabulary();
    // Element 0 of the returned array is used as the word count, element 1 as the
    // sentence count.
    int[] sourceWordsSentences = Vocabulary.initializeVocabulary(sourceFileName, sourceVocab, true);
    numSourceWords = sourceWordsSentences[0];
    numSourceSentences = sourceWordsSentences[1];
    
    Corpus sourceCorpusArray = SuffixArrayFactory.createCorpusArray(sourceFileName, sourceVocab, numSourceWords, numSourceSentences);
    Suffixes sourceSuffixArray = SuffixArrayFactory.createSuffixArray(sourceCorpusArray, maxCacheSize);
    
    int numTargetWords, numTargetSentences;
    Vocabulary targetVocab = new Vocabulary();
    int[] targetWordsSentences = Vocabulary.initializeVocabulary(targetFileName, targetVocab, true);
    numTargetWords = targetWordsSentences[0];
    numTargetSentences = targetWordsSentences[1];
    
    Corpus targetCorpusArray = SuffixArrayFactory.createCorpusArray(targetFileName, targetVocab, numTargetWords, numTargetSentences);
    Suffixes targetSuffixArray = SuffixArrayFactory.createSuffixArray(targetCorpusArray, maxCacheSize);
    
    // NOTE(review): the training size comes from the source corpus only; no check that
    // the target corpus has the same sentence count is visible here -- confirm.
    int trainingSize = sourceCorpusArray.getNumSentences();
    boolean requireTightSpans = true;
    // NOTE(review): the Scanner over the alignment file is not closed in the visible
    // code -- confirm whether AlignmentGrids closes it.
    Alignments alignments = new AlignmentGrids(new Scanner(new File(alignmentFileName)), sourceCorpusArray, targetCorpusArray, trainingSize, requireTightSpans);

View Full Code Here

          // Load the shared vocabulary, then the memory-mapped corpus and suffix array
          // for the source side and for the target side.
          Vocabulary commonVocab = new Vocabulary();
          ObjectInput in = BinaryIn.vocabulary(binaryVocabFileName);
          commonVocab.readExternal(in);


          logger.fine("Loading source corpus...");
          Corpus sourceCorpus = new MemoryMappedCorpusArray(commonVocab, binarySourceFileName);


          logger.fine("Loading source suffix array...");
          Suffixes sourceSuffixes = new MemoryMappedSuffixArray(binarySourceSuffixesFileName, sourceCorpus);
          
          logger.fine("Loading target corpus...");    
          Corpus targetCorpus = new MemoryMappedCorpusArray(commonVocab, binaryTargetFileName);
          
          logger.fine("Loading target suffix array...");
          // NOTE(review): this builds the *target* suffix array from
          // binarySourceSuffixesFileName and sourceCorpus, exactly the arguments used for
          // sourceSuffixes above. It looks like a copy-paste slip: the target suffixes
          // file name and targetCorpus were presumably intended -- confirm and fix.
          Suffixes targetSuffixes = new MemoryMappedSuffixArray(binarySourceSuffixesFileName, sourceCorpus);


          logger.fine("Loading alignment grids...");

View Full Code Here

0 1

TOP

Related Classes of joshua.corpus.Corpus

joshua.corpus.lexprob.LexProbs

joshua.corpus.lexprob.WriteLexProbs

joshua.corpus.suffix_array.FrequentClassesTest

joshua.corpus.suffix_array.FrequentPhrases

joshua.decoder.DecoderThreadTest

joshua.decoder.JoshuaDecoder

joshua.prefix_tree.ExtractRuleProfiler

joshua.prefix_tree.ExtractRules

joshua.prefix_tree.LMAdaptingRuleExtractor

joshua.ui.alignment.GridViewer

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by ORACLE Inc. Contact coftware#gmail.com.