Package joshua.corpus.vocab

Examples of joshua.corpus.vocab.Vocabulary


          String binaryTargetFileName = joshDirName + File.separator + "target.corpus";
//          String binaryTargetSuffixesFileName = joshDirName + File.separator + "target.suffixes";
          String binaryAlignmentFileName = joshDirName + File.separator + "alignment.grids";

          logger.fine("Loading vocabulary...");
          Vocabulary commonVocab = new Vocabulary();
          ObjectInput in = BinaryIn.vocabulary(binaryVocabFileName);
          commonVocab.readExternal(in);

          logger.fine("Loading source corpus...");
          Corpus sourceCorpus = new MemoryMappedCorpusArray(commonVocab, binarySourceFileName);

          logger.fine("Loading source suffix array...");
View Full Code Here


  //===============================================================
 
  public static void main(String[] args) throws IOException, ClassNotFoundException {


    Vocabulary symbolTable;
    Corpus corpusArray;
    Suffixes suffixArray;
    FrequentPhrases frequentPhrases;

    if (args.length == 1) {

      String corpusFileName = args[0];

      logger.info("Constructing vocabulary from file " + corpusFileName);
      symbolTable = new Vocabulary();
      int[] lengths = Vocabulary.initializeVocabulary(corpusFileName, symbolTable, true);

      logger.info("Constructing corpus array from file " + corpusFileName);
      corpusArray = SuffixArrayFactory.createCorpusArray(corpusFileName, symbolTable, lengths[0], lengths[1]);

      logger.info("Constructing suffix array from file " + corpusFileName);
      suffixArray = new SuffixArray(corpusArray, Cache.DEFAULT_CAPACITY);

    } else if (args.length == 3) {

      String binarySourceVocabFileName = args[0];
      String binaryCorpusFileName = args[1];
      String binarySuffixArrayFileName = args[2];

      if (logger.isLoggable(Level.INFO)) logger.info("Constructing source language vocabulary from binary file " + binarySourceVocabFileName);
      ObjectInput in = BinaryIn.vocabulary(binarySourceVocabFileName);
      symbolTable = new Vocabulary();
      symbolTable.readExternal(in);

      logger.info("Constructing corpus array from file " + binaryCorpusFileName);
      if (logger.isLoggable(Level.INFO)) logger.info("Constructing memory mapped source language corpus array.");
      corpusArray = new MemoryMappedCorpusArray(symbolTable, binaryCorpusFileName);
View Full Code Here

      for (String sentence : to_be_or_not_to_be) {
        String[] array = sentence.split("\\s+");
        Arrays.sort(array);
        for (String s : array) { set.add(s); }
      }
      symbolTableToBe = new Vocabulary(set);
      int[] lengths = Vocabulary.initializeVocabulary(corpusFileName, new Vocabulary(), true);

      logger.fine("Constructing corpus array from file " + corpusFileName);
      corpusToBe = SuffixArrayFactory.createCorpusArray(corpusFileName, symbolTableToBe, lengths[0], lengths[1]);

      logger.fine("Constructing suffix array from file " + corpusFileName);
View Full Code Here

        sourcePrintStream.println(sentence);
      }
      sourcePrintStream.close();
      corpusFileName = sourceFile.getAbsolutePath();
     
      Vocabulary symbolTable;
     
      logger.fine("Constructing vocabulary from file " + corpusFileName);
      ArrayList<String> words = new ArrayList<String>();
      for (String sentence : sentences) {
        String[] array = sentence.split("\\s+");
        for (String s : array) {
          if (! words.contains(s)) {
            words.add(s);
          }
        }
      }
      Collections.sort(words);
      LinkedHashSet<String> set = new LinkedHashSet<String>(words);
      symbolTable = new Vocabulary(set);
      int[] lengths = Vocabulary.initializeVocabulary(corpusFileName, new Vocabulary(), true);

      logger.fine("Constructing corpus array from file " + corpusFileName);
      corpusArray = SuffixArrayFactory.createCorpusArray(corpusFileName, symbolTable, lengths[0], lengths[1]);

      logger.fine("Constructing suffix array from file " + corpusFileName);
View Full Code Here

    // Tell System.out and System.err to use UTF8
    FormatUtil.useUTF8();
 
    try {
     
      Vocabulary symbolTable;
      Corpus corpusArray;
      Suffixes suffixArray;
     
      logger.fine("Constructing vocabulary from file " + corpusFileName);
      symbolTable = new Vocabulary();
      int[] lengths = Vocabulary.initializeVocabulary(corpusFileName, symbolTable, true);

      logger.fine("Constructing corpus array from file " + corpusFileName);
      corpusArray = SuffixArrayFactory.createCorpusArray(corpusFileName, symbolTable, lengths[0], lengths[1]);
View Full Code Here

  public SuffixArrayTest(String binaryFileName) throws IOException, ClassNotFoundException {
   
    // Adam Lopez's example...
    String corpusString = "it makes him and it mars him , it sets him on and it takes him off .";

    vocab = new Vocabulary();
    Phrase exampleSentence = new BasicPhrase(corpusString, vocab);
   
    exampleSentence = new BasicPhrase(corpusString, vocab);
    int[] sentences = new int[1];
    sentences[0] = 0;
View Full Code Here

public class AbstractHierarchicalPhrasesTest {

  @Test
  public void equalityTest() {
 
    Vocabulary vocab = new Vocabulary();
    vocab.addNonterminal("X");
    vocab.addTerminal("en");
    vocab.addTerminal("de");
    int X = vocab.getNonterminalID("X");
    int en = vocab.getID("en");
    int de = vocab.getID("de");
   
    int[] M_a_alpha_startPositions = {25,30,27,30};
    int[] M_a_alpha_sentenceNumbers = {2,2};
    Pattern M_a_alpha_pattern = new Pattern(vocab, en, X, de, X);
    Assert.assertEquals(M_a_alpha_pattern.arity(),2);
View Full Code Here

  }
 
  @Test
  public void queryIntersectTest() {
   
    Vocabulary vocab = new Vocabulary();
    vocab.addNonterminal("X");
    vocab.addTerminal("en");
    vocab.addTerminal("de");
    int X = vocab.getNonterminalID("X");
    int en = vocab.getID("en");
    int de = vocab.getID("de");
   
    int[] M_a_alpha_startPositions = {25,30,27,30};
    int[] M_a_alpha_sentenceNumbers = {2,2};
    Pattern M_a_alpha_pattern = new Pattern(vocab, en, X, de, X);
    Assert.assertEquals(M_a_alpha_pattern.arity(),2);
View Full Code Here

      logger.warning("By convention, the output directory should end in .josh");
    }
   
   
    // Construct common vocabulary
    Vocabulary symbolTable = new Vocabulary();
    if (logger.isLoggable(Level.INFO)) logger.info("Adding terminal tokens from file " + sourceCorpusFileName + " to common vocabulary");
    int[] sourceLengths = Vocabulary.initializeVocabulary(sourceCorpusFileName, symbolTable, false);
    if (logger.isLoggable(Level.INFO)) logger.info("Adding terminal tokens from file " + targetCorpusFileName + " to common vocabulary");
    int[] targetLengths = Vocabulary.initializeVocabulary(targetCorpusFileName, symbolTable, true);
   
    if (sourceLengths[1] != targetLengths[1]) {
      logger.severe("Source corpus and target corpus have different number of sentences (" + sourceLengths[1] + " vs " + targetLengths[1] + ")");
      System.exit(-3);
    }
    int numberOfSentences = sourceLengths[1];
   
    // Write README file to disk
    String readmeFilename = outputDirName + File.separator + "README.txt";
    PrintStream out = new PrintStream(readmeFilename);

    out.println("This directory contains the following binary files:");
    out.println();

   
   
    // Write vocabulary to disk
    {
      String binaryVocabFilename = outputDirName + File.separator + "common.vocab";
      if (logger.isLoggable(Level.INFO)) logger.info("Writing binary common vocabulary to disk at " + binaryVocabFilename);
     
      ObjectOutput vocabOut =
          new BinaryOut(new FileOutputStream(binaryVocabFilename), true);
      symbolTable.setExternalizableEncoding(charset);
        symbolTable.writeExternal(vocabOut);
        vocabOut.flush();
       
      out.println("Common symbol table for source and target language: " + binaryVocabFilename);
    }
   
View Full Code Here

    String binaryCorpusFilename = args[2];
    String charset = (args.length > 3) ? args[3] : "UTF-8";
   
    // Read the provided symbol table
    logger.info("Reading provided symbol table");
    Vocabulary symbolTable = new Vocabulary();
    ObjectInput in = BinaryIn.vocabulary(binaryVocabFilename);
    symbolTable.readExternal(in);
   
    // Read the provided corpus
    logger.info("Reading provided corpus");
    Vocabulary oldSymbolTable = new Vocabulary();
    int[] lengths = Vocabulary.initializeVocabulary(corpusFileName, oldSymbolTable, true);
    CorpusArray corpusArray = SuffixArrayFactory.createCorpusArray(corpusFileName, oldSymbolTable, lengths[0], lengths[1]);
   
    // Change the internal integer-string mappings
    // of the corpus to use those provided by the given symbol table.
View Full Code Here

TOP

Related Classes of joshua.corpus.vocab.Vocabulary

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.