Examples of TTable_monolithic_IFAs


Examples of edu.umd.hooka.ttables.TTable_monolithic_IFAs
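TTable_monolithic_IFAs is Hooka's monolithic translation-probability table: it stores, for each source-word index, an IndexedFloatArray of (target index, probability) pairs. The snippets on this page either open it read-only from a serialized file (the FileSystem/Path/boolean constructor) or build it empty in memory, fill it with set(), and persist it with write(). The sketch below is not from the original sources; it uses only constructors and methods visible in the snippets, and the import path for IndexedFloatArray and the output file name are assumptions.

    import java.io.BufferedOutputStream;
    import java.io.DataOutputStream;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    import edu.umd.hooka.alignment.IndexedFloatArray; // assumed package
    import edu.umd.hooka.ttables.TTable_monolithic_IFAs;

    public class TTableSketch {
      public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.getLocal(new Configuration());
        Path ttablePath = new Path("ttable.bin"); // illustrative location

        // One row per source word: target indices sorted ascending so lookups
        // can binary-search (cf. the sort in createTTableFromBerkeleyAligner below).
        TTable_monolithic_IFAs table = new TTable_monolithic_IFAs();
        int[] indices = { 3, 7, 42 };
        float[] probs = { 0.5f, 0.3f, 0.2f };
        table.set(1, new IndexedFloatArray(indices, probs, true));

        // Serialize with write(DataOutputStream), as the BerkeleyAligner converter does.
        DataOutputStream dos = new DataOutputStream(
            new BufferedOutputStream(fs.create(ttablePath)));
        table.write(dos);
        dos.close();

        // Reopen read-only, the pattern most snippets on this page follow.
        TTable_monolithic_IFAs probsTable =
            new TTable_monolithic_IFAs(fs, ttablePath, true);
      }
    }

In the first fragment below, the trainer either talks to a parameter server (PServerClient) or opens the table directly, depending on usePServer.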

        trainer = new Model1_InitUniform(useNullWord);
      } else if (trainerType.equals(MODEL1_TRAINER)) {
        if (usePServer)
          ttable = new PServerClient(pserveHost, pservePort);
        else
          ttable = new TTable_monolithic_IFAs(
              ttfs, ltp, true);

        trainer = new Model1(ttable, useNullWord);
      } else if (trainerType.equals(HMM_TRAINER)) {
        if (usePServer)
          ttable = new PServerClient(pserveHost, pservePort);
        else
          ttable = new TTable_monolithic_IFAs(
              ttfs, ltp, true);
        ATable atable = loadATable(job.getATablePath(), job);
        if (!useNullWord)
          trainer = new HMM(ttable, atable);
        else

Examples of edu.umd.hooka.ttables.TTable_monolithic_IFAs
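This merge step folds the partial counts from a completed Hadoop job back into the translation table: array-valued entries update per-word rows via tt.set(), the single ATable is rewritten to its own path, and the rebuilt TTable then replaces the old file on disk.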

      TTable tt = new TTable_monolithic_IFAs(fileSys, ttablePath, false);
      boolean emittedATable = false;
      FileStatus[] status = fileSys.listStatus(outputPath);
      for (int i=0; i<status.length; i++){
        sLogger.info("Reading " + status[i].getPath() + ", exists? " + fileSys.exists(status[i].getPath()));
        SequenceFile.Reader reader = new SequenceFile.Reader(xjob, SequenceFile.Reader.file(status[i].getPath()));
        while (reader.next(k, t)){
          if (t.getType() == PartialCountContainer.CONTENT_ARRAY) {
            tt.set(k.get(), (IndexedFloatArray)t.getContent());
            if (k.get() % 1000 == 0) reporter.progress();
            reporter.incrCounter(MergeCounters.EWORDS, 1);
            reporter.incrCounter(MergeCounters.STATISTICS, ((IndexedFloatArray)t.getContent()).size() + 1);
          } else {
            if (emittedATable)
              throw new RuntimeException("Should only have a single ATable!");
            ATable at = (ATable)t.getContent();
            fileSys.delete(atablePath, true);
            DataOutputStream dos = new DataOutputStream(
                new BufferedOutputStream(fileSys.create(atablePath)));
            at.write(dos);
            dos.close();
            emittedATable = true;
          }
        }
        reader.close();
      }
      fileSys.delete(ttablePath, true); // delete old ttable
      tt.write(); // write the new table to the same location
    }

Examples of edu.umd.hooka.ttables.TTable_monolithic_IFAs
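A variant of the same merge that zips several SequenceFile readers together with FileReaderZip instead of iterating over the output directory listing; note that the original authors flag the _logs deletion as a race condition.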

      // the following is a race condition
      fileSys.delete(outputPath.suffix("/_logs"), true);
      SequenceFile.Reader[] readers =
        SequenceFileOutputFormat.getReaders(xjob, outputPath);
      FileReaderZip z = new FileReaderZip(readers);
      TTable tt = new TTable_monolithic_IFAs(fileSys, ttablePath, false);
      boolean emittedATable = false;
      while (z.next(k,t)) {
        if (t.getType() == PartialCountContainer.CONTENT_ARRAY) {
          tt.set(k.get(), (IndexedFloatArray)t.getContent());
          if (k.get() % 1000 == 0) reporter.progress();
          reporter.incrCounter(MergeCounters.EWORDS, 1);
          reporter.incrCounter(MergeCounters.STATISTICS, ((IndexedFloatArray)t.getContent()).size() + 1);
        } else {
          if (emittedATable)
            throw new RuntimeException("Should only have a single ATable!");
          ATable at = (ATable)t.getContent();
          fileSys.delete(atablePath, true);
          DataOutputStream dos = new DataOutputStream(
              new BufferedOutputStream(fileSys.create(atablePath)));
          at.write(dos);
          dos.close();
          emittedATable = true;
        }
      }
      fileSys.delete(ttablePath, true); // delete old ttable
      tt.write(); // write the new table to the same location
      output.collect(key, value);
    }

Examples of edu.umd.hooka.ttables.TTable_monolithic_IFAs
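Loading bidirectional resources from distributed-cache path mappings: vocabularies and translation tables for both directions (F2E and E2F), followed by a target-language tokenizer.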

          throw new RuntimeException("Error loading Terms File for dictionary from "+localFiles[0]);
        }    

        eVocabTrg = HadoopAlign.loadVocab(pathMapping.get("Ivory.E_Vocab_F2E"), fs);
        fVocabSrc = HadoopAlign.loadVocab(pathMapping.get("Ivory.F_Vocab_F2E"), fs);
        f2e_Probs = new TTable_monolithic_IFAs(fs, pathMapping.get("Ivory.TTable_F2E"), true);

        eVocabSrc = HadoopAlign.loadVocab(pathMapping.get("Ivory.E_Vocab_E2F"), fs);
        fVocabTrg = HadoopAlign.loadVocab(pathMapping.get("Ivory.F_Vocab_E2F"), fs);
        e2f_Probs = new TTable_monolithic_IFAs(fs, pathMapping.get("Ivory.TTable_E2F"), true);

        String tokenizerModel = pathMapping.get(Constants.TargetTokenizer).toString();
        String stopwordsFile = pathMapping.get(Constants.TargetStopwordList).toString();
        String stemmedStopwordsFile = pathMapping.get(Constants.TargetStemmedStopwordList).toString();      
        tokenizer = TokenizerFactory.createTokenizer(fs, conf.get(Constants.TargetLanguage),

Examples of edu.umd.hooka.ttables.TTable_monolithic_IFAs
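Opening an E2F translation table and its vocabularies, after first checking that every required file exists.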

      if (!fs2.exists(new Path(fFile)) || !fs2.exists(new Path(eFile))
          || !fs2.exists(new Path(e2fttableFile)) || !fs2.exists(new Path(termsFile))
          || !fs2.exists(new Path(dfByIntFile))) {
        throw new RuntimeException("Error: Translation files do not exist!");
      }

      Vocab eVocab_e2f = null, fVocab_e2f = null;
      TTable_monolithic_IFAs en2DeProbs = null;
      try {
        eVocab_e2f = HadoopAlign.loadVocab(new Path(eFile), conf);
        fVocab_e2f = HadoopAlign.loadVocab(new Path(fFile), conf);

        en2DeProbs = new TTable_monolithic_IFAs(fs2, new Path(e2fttableFile), true);
      } catch (IOException e) {
        e.printStackTrace();
      }

      DefaultFrequencySortedDictionary dict = new DefaultFrequencySortedDictionary(
          new Path(env.getIndexTermsData()), new Path(env.getIndexTermIdsData()),
          new Path(env.getIndexTermIdMappingData()), fs2);

Examples of edu.umd.hooka.ttables.TTable_monolithic_IFAs
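Setup that reads all four vocabularies and both translation tables from the local file system via path mappings, then builds the foreign-language tokenizer (which may live in a directory rather than a single cached file).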

    eVocabSrc = (VocabularyWritable) HadoopAlign.loadVocab(pathMapping.get(eVocabSrcFile), localFs);
    eVocabTrg = (VocabularyWritable) HadoopAlign.loadVocab(pathMapping.get(eVocabTrgFile), localFs);
    fVocabSrc = (VocabularyWritable) HadoopAlign.loadVocab(pathMapping.get(fVocabSrcFile), localFs);
    fVocabTrg = (VocabularyWritable) HadoopAlign.loadVocab(pathMapping.get(fVocabTrgFile), localFs);        
    f2e_Probs = new TTable_monolithic_IFAs(localFs, pathMapping.get(f2e_ttableFile), true);
    e2f_Probs = new TTable_monolithic_IFAs(localFs, pathMapping.get(e2f_ttableFile), true);

    // tokenizer file not read from cache, since it might be a directory (e.g. Chinese segmenter)
    String tokenizerFile = conf.get("fTokenizer");
    fTok = TokenizerFactory.createTokenizer(fs, fLang, tokenizerFile, true, conf.get("fStopword"), conf.get("fStemmedStopword"), null);
    sLogger.info("Tokenizer and vocabs created successfully from " + fLang + " " + tokenizerFile + "," + conf.get("fStopword") + "," + conf.get("fStemmedStopword"));

Examples of edu.umd.hooka.ttables.TTable_monolithic_IFAs
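The same setup with file locations taken straight from configuration properties instead of a path mapping.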

    eVocabSrc = (VocabularyWritable) HadoopAlign.loadVocab(new Path(conf.get("eVocabSrcFile")), localFs);
    eVocabTrg = (VocabularyWritable) HadoopAlign.loadVocab(new Path(conf.get("eVocabTrgFile")), localFs);
    fVocabSrc = (VocabularyWritable) HadoopAlign.loadVocab(new Path(conf.get("fVocabSrcFile")), localFs);
    fVocabTrg = (VocabularyWritable) HadoopAlign.loadVocab(new Path(conf.get("fVocabTrgFile")), localFs);        
    f2e_Probs = new TTable_monolithic_IFAs(localFs, new Path(conf.get("f2e_ttableFile")), true);
    e2f_Probs = new TTable_monolithic_IFAs(localFs, new Path(conf.get("e2f_ttableFile")), true);

    // tokenizer file not read from cache, since it might be a directory (e.g. Chinese segmenter)
    String tokenizerFile = conf.get("fTokenizer");
    fTok = TokenizerFactory.createTokenizer(localFs, fLang, tokenizerFile, true, conf.get("fStopword"), null, null);
    sLogger.info("Tokenizer and vocabs created successfully.");

Examples of edu.umd.hooka.ttables.TTable_monolithic_IFAs
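Initialization for sentence-pair processing: vocabularies, both probability tables, and a tokenizer for each language.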

      }
      eVocabSrc = HadoopAlign.loadVocab(new Path(eVocabSrcFile), localFs);
      eVocabTrg = HadoopAlign.loadVocab(new Path(eVocabTrgFile), localFs);
      fVocabSrc = HadoopAlign.loadVocab(new Path(fVocabSrcFile), localFs);
      fVocabTrg = HadoopAlign.loadVocab(new Path(fVocabTrgFile), localFs);
      f2e_Probs = new TTable_monolithic_IFAs(localFs, new Path(probTablef2eFile), true);
      e2f_Probs = new TTable_monolithic_IFAs(localFs, new Path(probTablee2fFile), true);
      Tokenizer fTokenizer = TokenizerFactory.createTokenizer(localFs, fLang, fTokenFile, false);
      Tokenizer eTokenizer = TokenizerFactory.createTokenizer(localFs, eLang, eTokenFile, false);
      long startTime = System.currentTimeMillis();

      if (pairsFile == null) {

Examples of edu.umd.hooka.ttables.TTable_monolithic_IFAs
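Test-style setup loading German-English vocabularies and translation tables from fixed directories before reading sample Europarl sentences.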

    FileSystem localFs = FileSystem.getLocal(new Configuration());
    eVocabSrc = HadoopAlign.loadVocab(new Path(VOCABDIR+"/vocab.en-de.en"), localFs);
    eVocabTrg = HadoopAlign.loadVocab(new Path(VOCABDIR+"/vocab.de-en.en"), localFs);
    fVocabSrc = HadoopAlign.loadVocab(new Path(VOCABDIR+"/vocab.de-en.de"), localFs);
    fVocabTrg = HadoopAlign.loadVocab(new Path(VOCABDIR+"/vocab.en-de.de"), localFs);
    f2e_Probs = new TTable_monolithic_IFAs(localFs, new Path(VOCABDIR+"/ttable.de-en"), true);
    e2f_Probs = new TTable_monolithic_IFAs(localFs, new Path(VOCABDIR+"/ttable.en-de"), true);
    dt.readSentences(10,
        DATADIR+"/europarl-v6.sample.de",
        DATADIR+"/europarl-v6.sample.de", "en", "de",
        TOKENDIR+"/de-token.bin",
        TOKENDIR+"/en-token.bin",

Examples of edu.umd.hooka.ttables.TTable_monolithic_IFAs
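The converter below builds a TTable_monolithic_IFAs from scratch out of BerkeleyAligner output: for each source term it parses up to numTrans translations (stopping early once their cumulative probability exceeds probThreshold), normalizes the probabilities, sorts entries by target-vocabulary index for binary search, and finally serializes both vocabularies and the table.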

   * @throws IOException
   */
  public static void createTTableFromBerkeleyAligner(String inputFile, String srcVocabFile,
      String trgVocabFile, String probsFile, float probThreshold, int numTrans,
      FileSystem fs) throws IOException {
    logger.setLevel(Level.INFO);

    TTable_monolithic_IFAs table = new TTable_monolithic_IFAs();
    VocabularyWritable trgVocab = new VocabularyWritable(), srcVocab = new VocabularyWritable();
    File file = new File(inputFile);
    FileInputStream fis = null;
    BufferedReader bis = null;
    //    int cntLongTail = 0, cntShortTail = 0, sumShortTail = 0
    int cnt = 0;    // for statistical purposes only
    //    float sumCumProbs = 0f;                      // for statistical purposes only
    HookaStats stats = new HookaStats(numTrans, probThreshold);

    //In BerkeleyAligner output, dictionary entries of each source term are already sorted by prob. value.
    try {
      fis = new FileInputStream(file);

      bis = new BufferedReader(new InputStreamReader(fis, "UTF-8"));
      String cur = null;
      boolean earlyTerminate = false;
      String line = "";
      // Compile both patterns once, outside the read loop.
      Pattern p = Pattern.compile("(.+)\\tentropy .+nTrans");
      Pattern p2 = Pattern.compile("\\s*(\\S+): (.+)");
      while (true) {
        if (!earlyTerminate) {
          line = bis.readLine();
          if (line == null)
            break;
          cnt++;
        }
        earlyTerminate = false;
        logger.debug("Line:" + line);

        Matcher m = p.matcher(line);
        if (m.find()) {
          cur = m.group(1);

          int gerIndex = srcVocab.addOrGet(cur);
          logger.debug("Found: " + cur + " with index: " + gerIndex);

          List<PairOfIntFloat> indexProbPairs = new ArrayList<PairOfIntFloat>();
          float sumOfProbs = 0.0f;
          for(int i=0;i<numTrans;i++){
            if((line=bis.readLine())!=null){
              cnt++;
              Matcher m2 = p2.matcher(line);
              if(!m2.find()){
                m = p.matcher(line);
                if(m.find()){
                  logger.debug("Early terminate");
                  earlyTerminate = true;
                  i = numTrans;
                  break;
                }
                //                logger.debug("FFFF"+line);
              }else{
                String term = m2.group(1);
                if (!term.equals("NULL")) {
                  float prob = Float.parseFloat(m2.group(2));
                  int engIndex = trgVocab.addOrGet(term);
                  logger.debug("Added: "+term+" with index: "+engIndex+" and prob:"+prob);
                  indexProbPairs.add(new PairOfIntFloat(engIndex, prob));
                  sumOfProbs+=prob;
                }
              }
            }
            if(sumOfProbs > probThreshold){
              stats.incCntShortTail(1);
              stats.incSumShortTail(i+1);
              break;
            }
          }
          if (sumOfProbs <= probThreshold) {
            // ran out of listed translations before reaching the threshold (long tail)
            stats.incCntLongTail(1);
            stats.incSumCumProbs(sumOfProbs);
          }

          // to enable faster access with binary search, we sort entries by vocabulary index.
          Collections.sort(indexProbPairs);
          int i=0;
          int numEntries = indexProbPairs.size();
          int[] indices = new int[numEntries];
          float[] probs = new float[numEntries];
          for(PairOfIntFloat pair : indexProbPairs){
            indices[i] = pair.getLeftElement();
            probs[i++] = pair.getRightElement()/sumOfProbs;
          }
          table.set(gerIndex, new IndexedFloatArray(indices, probs, true));
        }
      }

      // dispose of the resources after use.
      fis.close();
      bis.close();
      //      dis.close();
    }catch (FileNotFoundException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }
    logger.info("File "+inputFile+": read "+cnt+" lines");
    logger.info("Vocabulary Target: "+trgVocab.size()+" elements");
    logger.info("Vocabulary Source: "+srcVocab.size()+" elements");
    logger.info(stats);

    DataOutputStream dos = new DataOutputStream(new BufferedOutputStream
        (fs.create(new Path(trgVocabFile))));
    trgVocab.write(dos); // already a VocabularyWritable; no cast needed
    dos.close();
    DataOutputStream dos2 = new DataOutputStream(new BufferedOutputStream
        (fs.create(new Path(srcVocabFile))));
    srcVocab.write(dos2);
    dos2.close();
    DataOutputStream dos3 = new DataOutputStream(new BufferedOutputStream
        (fs.create(new Path(probsFile))));
    table.write(dos3);
    dos3.close();
  }
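
A hypothetical invocation of the converter above; every path and both threshold values are illustrative:

    FileSystem fs = FileSystem.getLocal(new Configuration());
    // Keep at most 15 translations per source word, or fewer once their
    // cumulative probability passes 0.9; writes two vocabulary files and
    // the serialized TTable_monolithic_IFAs.
    createTTableFromBerkeleyAligner(
        "berkeley.dict",          // hypothetical BerkeleyAligner dictionary dump
        "vocab.src", "vocab.trg", // output vocabulary files
        "ttable.src-trg",         // output translation table
        0.9f, 15, fs);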