Package joshua.corpus.vocab

Examples of joshua.corpus.vocab.Vocabulary


        sourcePrintStream.println(sentence);
      }
      sourcePrintStream.close();
      String sourceCorpusFileName = sourceFile.getAbsolutePath();
     
      Vocabulary symbolTable = new Vocabulary();
      int[] sourceLengths = Vocabulary.initializeVocabulary(sourceCorpusFileName, symbolTable, true);
      Assert.assertEquals(sourceLengths.length, 2);
      int numberOfSentences = sourceLengths[1];
     
      Corpus sourceCorpus = SuffixArrayFactory.createCorpusArray(sourceCorpusFileName, symbolTable, sourceLengths[0], sourceLengths[1]);
View Full Code Here


    this.vocab = vocab;
  }

  public ArpaFile(String arpaFileName) throws IOException {
    this.arpaFile = new File(arpaFileName);
    this.vocab = new Vocabulary();
   
//    final Scanner scanner = new Scanner(arpaFile);
   
//    // Eat initial header lines
//    while (scanner.hasNextLine()) {
View Full Code Here

   
    ////////////////////////////////
    // Common vocabulary          //
    ////////////////////////////////
    if (logger.isLoggable(Level.INFO)) logger.info("Constructing empty common vocabulary");
    Vocabulary commonVocab = new Vocabulary();
    int numSourceWords, numSourceSentences;
    int numTargetWords, numTargetSentences;
    String binaryCommonVocabFileName = this.commonVocabFileName;
    if (binaryCorpus) {
      if (logger.isLoggable(Level.INFO)) logger.info("Initializing common vocabulary from binary file " + binaryCommonVocabFileName);
      ObjectInput in = BinaryIn.vocabulary(binaryCommonVocabFileName);
      commonVocab.readExternal(in);
     
      numSourceWords = Integer.MIN_VALUE;
      numSourceSentences = Integer.MIN_VALUE;
     
      numTargetWords = Integer.MIN_VALUE;
View Full Code Here

        targetSuffixArray, alignments,
        lexProbs, models, sampleSize,
        maxPhraseSpan, maxPhraseLength,
        minNonterminalSpan, maxNonterminalSpan);
   
    SymbolTable vocab = new Vocabulary();
   
    Corpus corpus = suffixArray.getCorpus();
   
    NGramLanguageModel largeLM = new LMGrammarJAVA(
        vocab,
View Full Code Here

    srilm.write_default_vocab_map(tmpFile.getAbsolutePath());
   
   
    // Create a vocabulary object using the SRILM integer mappings
    Scanner scanner = new Scanner(tmpFile);
    Vocabulary vocab = Vocabulary.getVocabFromSRILM(scanner);
//    vocab.fixVocabulary();
   
   
    // Write the vocabulary to disk in binary format
    ObjectOutput out = new BinaryOut(outVocabFile);
    vocab.writeExternal(out);
   
  }
View Full Code Here

   * @param base a double. The base of the logarithm for quantization.
   */
  private BloomFilterLanguageModel(String filename, int order, int size, double base) {
    super(null, order);
    quantizationBase = base;
    vocabulary = new Vocabulary();
    populateBloomFilter(size, filename);
  }
View Full Code Here

   *
   * @param in an ObjectInput stream to read from
   */
  public void readExternal(ObjectInput in)
  throws IOException, ClassNotFoundException {
    vocabulary = new Vocabulary();
    int vocabSize = in.readInt();
    for (int i = 0; i < vocabSize; i++) {
      String line = in.readUTF();
      vocabulary.addTerminal(line);
    }
View Full Code Here

       
    }

  private static ParallelCorpus getParallelCorpus(String joshDir, int cacheSize) throws IOException, ClassNotFoundException {
   
    Vocabulary commonVocab = new Vocabulary();
      String binaryVocabFileName = joshDir + "/common.vocab";
      ObjectInput in = BinaryIn.vocabulary(binaryVocabFileName);
    commonVocab.readExternal(in);
   
    String sourceFileName = joshDir + "/source.corpus";
    Corpus sourceCorpusArray = new MemoryMappedCorpusArray(commonVocab, sourceFileName);

    String targetFileName = joshDir + "/target.corpus";
View Full Code Here

  SymbolTable vocab;
 
  @Test
  public void setup() {
   
    vocab = new Vocabulary();
    vocab.addTerminal("a");
    vocab.addTerminal("because");
    vocab.addTerminal("boycott");
    vocab.addTerminal("of");
    vocab.addTerminal("parliament");
View Full Code Here

      //      accidentally used anywhere.
     
      if (logger.isLoggable(Level.INFO))
        logger.info("Reading common vocabulary from " +
            binaryVocabFileName);
      Vocabulary commonVocab = new Vocabulary();
      commonVocab.readExternal(
          BinaryIn.vocabulary(binaryVocabFileName));

      // Initialize symbol table using suffix array's vocab
      this.initializeSymbolTable(commonVocab);
    }
View Full Code Here

TOP

Related Classes of joshua.corpus.vocab.Vocabulary

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.