Package joshua.corpus.vocab

Examples of joshua.corpus.vocab.Vocabulary



  // TODO: fuse createVocabulary and createCorpusArray together to avoid allocating the trivial array.
  public static CorpusArray createCorpusArray(String inputFilename)
  throws IOException {
    Vocabulary vocabulary = new Vocabulary();
    int[] ws = Vocabulary.initializeVocabulary(inputFilename, vocabulary, true);
    return createCorpusArray(inputFilename, vocabulary, ws[0], ws[1]);
  }
View Full Code Here


  int numBuiltInTerminals = 4;
 
  @Test
  public void basicVocabTest() {
   
    Vocabulary vocab1 = new Vocabulary();
    Vocabulary vocab2 = new Vocabulary(new HashSet<String>());
   
    Assert.assertEquals(vocab1, vocab2);
   
    Assert.assertFalse(vocab1.intToString.isEmpty());
//    Assert.assertTrue(vocab1.intToString.get(0)==Vocabulary.UNKNOWN_WORD_STRING);
    Assert.assertFalse(vocab1.getWords().isEmpty());
    Assert.assertTrue(vocab1.getWord(0)==Vocabulary.UNKNOWN_WORD_STRING);
    Assert.assertEquals(vocab1.getWords(), vocab1.intToString.values());

    Assert.assertEquals(vocab1.size(), numBuiltInSymbols);
    Assert.assertEquals(vocab1.getWord(Vocabulary.UNKNOWN_WORD), Vocabulary.UNKNOWN_WORD_STRING);

    //Assert.assertEquals(vocab1.getID("sample"), Vocabulary.UNKNOWN_WORD);
    //Assert.assertEquals(vocab1.getID(null), Vocabulary.UNKNOWN_WORD);

    Assert.assertFalse(vocab1.terminalToInt.isEmpty());
    Assert.assertEquals(vocab1.terminalToInt.size(), this.numBuiltInTerminals);
//    Assert.assertFalse(vocab1.isFixed);
//   
//    vocab1.fixVocabulary();
//    Assert.assertTrue(vocab1.isFixed);
   
    Assert.assertEquals(vocab1.getID(SymbolTable.X_STRING), -1);
    Assert.assertEquals(vocab1.getID(SymbolTable.X1_STRING), -2);
    Assert.assertEquals(vocab1.getID(SymbolTable.X2_STRING), -3);
   
    Assert.assertEquals(vocab1.getWord(-1), SymbolTable.X_STRING);
    Assert.assertEquals(vocab1.getWord(-2), SymbolTable.X1_STRING);
    Assert.assertEquals(vocab1.getWord(-3), SymbolTable.X2_STRING);
   
   
   
    Assert.assertFalse(vocab2.intToString.isEmpty());
//    Assert.assertTrue(vocab2.intToString.get(0)==Vocabulary.UNKNOWN_WORD_STRING);
    Assert.assertFalse(vocab2.getWords().isEmpty());
//    Assert.assertTrue(vocab2.getWord(0)==Vocabulary.UNKNOWN_WORD_STRING);
    Assert.assertEquals(vocab2.getWords(), vocab2.intToString.values());

    Assert.assertEquals(vocab2.size(), numBuiltInSymbols);
    Assert.assertEquals(vocab2.getWord(Vocabulary.UNKNOWN_WORD), Vocabulary.UNKNOWN_WORD_STRING);

//    Assert.assertEquals(vocab2.getID("sample"), Vocabulary.UNKNOWN_WORD);
//    Assert.assertEquals(vocab2.getID(null), Vocabulary.UNKNOWN_WORD);
   
    Assert.assertFalse(vocab2.terminalToInt.isEmpty());
View Full Code Here

      sourcePrintStream.println(corpusString);
      sourcePrintStream.close();
      sourceFileName = sourceFile.getAbsolutePath();
    }
   
    Vocabulary vocab = new Vocabulary();
    Vocabulary.initializeVocabulary(sourceFileName, vocab, true);
   
    Assert.assertEquals(vocab.getWord(vocab.getID("it")), "it");
    Assert.assertEquals(vocab.getWord(vocab.getID("makes")), "makes");
    Assert.assertEquals(vocab.getWord(vocab.getID("him")), "him");
    Assert.assertEquals(vocab.getWord(vocab.getID("and")), "and");
    Assert.assertEquals(vocab.getWord(vocab.getID("mars")), "mars");
    Assert.assertEquals(vocab.getWord(vocab.getID(",")), ",");
    Assert.assertEquals(vocab.getWord(vocab.getID("sets")), "sets");
    Assert.assertEquals(vocab.getWord(vocab.getID("on")), "on");
    Assert.assertEquals(vocab.getWord(vocab.getID("takes")), "takes");
    Assert.assertEquals(vocab.getWord(vocab.getID("off")), "off");
   
//    Assert.assertEquals(vocab.getWord(vocab.getID("persuades")), Vocabulary.UNKNOWN_WORD_STRING);
//    Assert.assertEquals(vocab.getWord(vocab.getID("disheartens")), Vocabulary.UNKNOWN_WORD_STRING);
  }
View Full Code Here

    String filename = "data/tiny.en";
    int numSentences = 5// Should be 5 sentences in tiny.en
    int numWords = 89;     // Should be 89 words in tiny.en
    int numUniqWords = 60; // Should be 60 unique words in tiny.en
   
    Vocabulary vocab = new Vocabulary();
    Vocabulary vocab2 = new Vocabulary();
   
    Assert.assertTrue(vocab.equals(vocab2));
    Assert.assertTrue(vocab2.equals(vocab));
    Assert.assertEquals(vocab, vocab2);
   
    try {
      int[] result = Vocabulary.initializeVocabulary(filename, vocab, true);
      Assert.assertNotNull(result);
      Assert.assertEquals(result.length, 2);
      Assert.assertEquals(result[0], numWords);
      Assert.assertEquals(result[1], numSentences)
     
//      Assert.assertTrue(vocab.isFixed);
      Assert.assertEquals(vocab.size(), numUniqWords+numBuiltInSymbols);
     
    } catch (IOException e) {
      Assert.fail("Could not load file " + filename);
    }
   
    Assert.assertFalse(vocab.equals(vocab2));
   
    try {
      int[] result = Vocabulary.initializeVocabulary(filename, vocab2, true);
      Assert.assertNotNull(result);
      Assert.assertEquals(result.length, 2);
      Assert.assertEquals(result[0], numWords);
      Assert.assertEquals(result[1], numSentences)
     
//      Assert.assertTrue(vocab2.isFixed);
      Assert.assertEquals(vocab2.size(), numUniqWords+numBuiltInSymbols);
     
    } catch (IOException e) {
      Assert.fail("Could not load file " + filename);
    }
   
View Full Code Here

        sourcePrintStream.println(sentence);
      }
      sourcePrintStream.close();
      String corpusFileName = sourceFile.getAbsolutePath();
     
      Vocabulary symbolTable;
     
      logger.fine("Constructing vocabulary from file " + corpusFileName);
      symbolTable = new Vocabulary();
      int[] lengths = Vocabulary.initializeVocabulary(corpusFileName, symbolTable, true);

      logger.fine("Constructing corpus array from file " + corpusFileName);
      Corpus corpus = SuffixArrayFactory.createCorpusArray(corpusFileName, symbolTable, lengths[0], lengths[1]);
View Full Code Here

   
   
    try {
     
      // FIX: can't use createVocabulary(String) because we set numWords and numSentences
      Vocabulary vocab = new Vocabulary();
      Vocabulary.initializeVocabulary(filename, vocab, true);
      CorpusArray corpus = SuffixArrayFactory.createCorpusArray(filename, vocab, numWords, numSentences);
     
      corpus.write(filename+".corpus", filename+".vocab", "UTF-8");
     
View Full Code Here

TOP

Related Classes of joshua.corpus.vocab.Vocabulary

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.