Package joshua.corpus.vocab

Source Code of joshua.corpus.vocab.VocabularyTest

/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
* License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this library; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package joshua.corpus.vocab;

import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Date;
import java.util.HashSet;

import joshua.corpus.vocab.Vocabulary;


import org.testng.Assert;
import org.testng.annotations.Test;



/**
*
*
* @author Lane Schwartz
*/
public class VocabularyTest {

  /** [X], [X,1], [X,2], [S], [S,1] <unk>, <s>, </s>, -pau-*/
  int numBuiltInSymbols = 9;
 
  /** <unk>, <s>, </s>, -pau- */
  int numBuiltInTerminals = 4;
 
  @Test
  public void basicVocabTest() {
   
    Vocabulary vocab1 = new Vocabulary();
    Vocabulary vocab2 = new Vocabulary(new HashSet<String>());
   
    Assert.assertEquals(vocab1, vocab2);
   
    Assert.assertFalse(vocab1.intToString.isEmpty());
//    Assert.assertTrue(vocab1.intToString.get(0)==Vocabulary.UNKNOWN_WORD_STRING);
    Assert.assertFalse(vocab1.getWords().isEmpty());
    Assert.assertTrue(vocab1.getWord(0)==Vocabulary.UNKNOWN_WORD_STRING);
    Assert.assertEquals(vocab1.getWords(), vocab1.intToString.values());

    Assert.assertEquals(vocab1.size(), numBuiltInSymbols);
    Assert.assertEquals(vocab1.getWord(Vocabulary.UNKNOWN_WORD), Vocabulary.UNKNOWN_WORD_STRING);

    //Assert.assertEquals(vocab1.getID("sample"), Vocabulary.UNKNOWN_WORD);
    //Assert.assertEquals(vocab1.getID(null), Vocabulary.UNKNOWN_WORD);

    Assert.assertFalse(vocab1.terminalToInt.isEmpty());
    Assert.assertEquals(vocab1.terminalToInt.size(), this.numBuiltInTerminals);
//    Assert.assertFalse(vocab1.isFixed);
//   
//    vocab1.fixVocabulary();
//    Assert.assertTrue(vocab1.isFixed);
   
    Assert.assertEquals(vocab1.getID(SymbolTable.X_STRING), -1);
    Assert.assertEquals(vocab1.getID(SymbolTable.X1_STRING), -2);
    Assert.assertEquals(vocab1.getID(SymbolTable.X2_STRING), -3);
   
    Assert.assertEquals(vocab1.getWord(-1), SymbolTable.X_STRING);
    Assert.assertEquals(vocab1.getWord(-2), SymbolTable.X1_STRING);
    Assert.assertEquals(vocab1.getWord(-3), SymbolTable.X2_STRING);
   
   
   
    Assert.assertFalse(vocab2.intToString.isEmpty());
//    Assert.assertTrue(vocab2.intToString.get(0)==Vocabulary.UNKNOWN_WORD_STRING);
    Assert.assertFalse(vocab2.getWords().isEmpty());
//    Assert.assertTrue(vocab2.getWord(0)==Vocabulary.UNKNOWN_WORD_STRING);
    Assert.assertEquals(vocab2.getWords(), vocab2.intToString.values());

    Assert.assertEquals(vocab2.size(), numBuiltInSymbols);
    Assert.assertEquals(vocab2.getWord(Vocabulary.UNKNOWN_WORD), Vocabulary.UNKNOWN_WORD_STRING);

//    Assert.assertEquals(vocab2.getID("sample"), Vocabulary.UNKNOWN_WORD);
//    Assert.assertEquals(vocab2.getID(null), Vocabulary.UNKNOWN_WORD);
   
    Assert.assertFalse(vocab2.terminalToInt.isEmpty());
    Assert.assertEquals(vocab2.terminalToInt.size(), this.numBuiltInTerminals);
//    Assert.assertTrue(vocab2.isFixed);
   


  }

  @Test
  public void verifyWordIDs() throws IOException {
   
    // Adam Lopez's example...
    String corpusString = "it makes him and it mars him , it sets him on and it takes him off .";
//    String queryString = "it persuades him and it disheartens him";
   
    String sourceFileName;
    {
      File sourceFile = File.createTempFile("source", new Date().toString());
      PrintStream sourcePrintStream = new PrintStream(sourceFile, "UTF-8");
      sourcePrintStream.println(corpusString);
      sourcePrintStream.close();
      sourceFileName = sourceFile.getAbsolutePath();
    }
   
    Vocabulary vocab = new Vocabulary();
    Vocabulary.initializeVocabulary(sourceFileName, vocab, true);
   
    Assert.assertEquals(vocab.getWord(vocab.getID("it")), "it");
    Assert.assertEquals(vocab.getWord(vocab.getID("makes")), "makes");
    Assert.assertEquals(vocab.getWord(vocab.getID("him")), "him");
    Assert.assertEquals(vocab.getWord(vocab.getID("and")), "and");
    Assert.assertEquals(vocab.getWord(vocab.getID("mars")), "mars");
    Assert.assertEquals(vocab.getWord(vocab.getID(",")), ",");
    Assert.assertEquals(vocab.getWord(vocab.getID("sets")), "sets");
    Assert.assertEquals(vocab.getWord(vocab.getID("on")), "on");
    Assert.assertEquals(vocab.getWord(vocab.getID("takes")), "takes");
    Assert.assertEquals(vocab.getWord(vocab.getID("off")), "off");
   
//    Assert.assertEquals(vocab.getWord(vocab.getID("persuades")), Vocabulary.UNKNOWN_WORD_STRING);
//    Assert.assertEquals(vocab.getWord(vocab.getID("disheartens")), Vocabulary.UNKNOWN_WORD_STRING);
  }
 
  @Test
  public void loadVocabFromFile() {
   
    String filename = "data/tiny.en";
    int numSentences = 5// Should be 5 sentences in tiny.en
    int numWords = 89;     // Should be 89 words in tiny.en
    int numUniqWords = 60; // Should be 60 unique words in tiny.en
   
    Vocabulary vocab = new Vocabulary();
    Vocabulary vocab2 = new Vocabulary();
   
    Assert.assertTrue(vocab.equals(vocab2));
    Assert.assertTrue(vocab2.equals(vocab));
    Assert.assertEquals(vocab, vocab2);
   
    try {
      int[] result = Vocabulary.initializeVocabulary(filename, vocab, true);
      Assert.assertNotNull(result);
      Assert.assertEquals(result.length, 2);
      Assert.assertEquals(result[0], numWords);
      Assert.assertEquals(result[1], numSentences)
     
//      Assert.assertTrue(vocab.isFixed);
      Assert.assertEquals(vocab.size(), numUniqWords+numBuiltInSymbols);
     
    } catch (IOException e) {
      Assert.fail("Could not load file " + filename);
    }
   
    Assert.assertFalse(vocab.equals(vocab2));
   
    try {
      int[] result = Vocabulary.initializeVocabulary(filename, vocab2, true);
      Assert.assertNotNull(result);
      Assert.assertEquals(result.length, 2);
      Assert.assertEquals(result[0], numWords);
      Assert.assertEquals(result[1], numSentences)
     
//      Assert.assertTrue(vocab2.isFixed);
      Assert.assertEquals(vocab2.size(), numUniqWords+numBuiltInSymbols);
     
    } catch (IOException e) {
      Assert.fail("Could not load file " + filename);
    }
   
    Assert.assertEquals(vocab, vocab2);
  }
}
TOP

Related Classes of joshua.corpus.vocab.VocabularyTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.