Package ivory.core

Examples of ivory.core.RetrievalEnvironment.readCollectionDocumentCount()
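
readCollectionDocumentCount() returns the number of documents in the indexed collection, as recorded in the index directory when the index was built. A minimal, self-contained sketch of the pattern the examples below share: open a RetrievalEnvironment over an existing Ivory index and read the stored count. This sketch is illustrative only; the command-line argument and printed labels are assumptions, not part of the Ivory API.

      import org.apache.hadoop.conf.Configuration;
      import org.apache.hadoop.fs.FileSystem;
      import ivory.core.RetrievalEnvironment;

      public class ReadDocCount {
        public static void main(String[] args) throws Exception {
          // args[0] is assumed to be the path of an existing Ivory index directory.
          Configuration conf = new Configuration();
          FileSystem fs = FileSystem.get(conf);
          RetrievalEnvironment env = new RetrievalEnvironment(args[0], fs);
          // Reads the document count stored in the index directory.
          System.out.println("CollectionDocumentCount: " + env.readCollectionDocumentCount());
          System.out.println("CollectionLength: " + env.readCollectionLength());
        }
      }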


      sLogger.info("reading stats for " + index);

      RetrievalEnvironment env = new RetrievalEnvironment(index, fs);

      long l = env.readCollectionLength();
      int n = env.readCollectionDocumentCount();

      sLogger.info(" - CollectionLength: " + l);
      sLogger.info(" - CollectionDocumentCount: " + n);

      collectionLength += l;
View Full Code Here


    }
    LOG.info(mDLTable.getAvgDocLength() + " is average doc len.");
    LOG.info(mDLTable.getDocCount() + " is num docs.");

    conf.setFloat("Ivory.AvgDocLen", mDLTable.getAvgDocLength());
    conf.setInt("Ivory.CollectionDocumentCount", env.readCollectionDocumentCount());
   
    conf.setNumMapTasks(300);     
    conf.setNumReduceTasks(0);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInt("mapred.map.max.attempts", 10);
View Full Code Here
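
The snippet above passes the collection statistics to the MapReduce job through its configuration. As a hedged counterpart (not taken from the Ivory source), a mapper using the old mapred API could read the same properties back in configure():

      // Illustrative only: reading back the properties set above inside a task.
      // The property names match the snippet; the default values are placeholders.
      public void configure(JobConf conf) {
        float avgDocLen = conf.getFloat("Ivory.AvgDocLen", 0.0f);
        int numDocs = conf.getInt("Ivory.CollectionDocumentCount", 0);
        // ... use avgDocLen and numDocs, e.g., for score normalization ...
      }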

    FileSystem fs = FileSystem.get(job2);
   
    String indexPath = getConf().get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    int blockSize = getConf().getInt("Ivory.BlockSize", 0);   
    int numDocs = env.readCollectionDocumentCount();
    int numBlocks = numDocs / blockSize + 1;

    String inputPath = null;
    for (int i = 0; i < numBlocks; i++) {
      inputPath = conf.get("Ivory.PCPOutputPath") + "/block" + i;  // one block of output of the PCP algorithm
View Full Code Here
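
In the snippet above, numDocs / blockSize + 1 always covers the remainder, but it produces one extra (empty) block whenever numDocs is an exact multiple of blockSize. An exact ceiling-division variant, shown only as an illustrative sketch and assuming blockSize > 0, would be:

      // Smallest number of blocks of size blockSize that can hold numDocs documents.
      int numBlocks = (numDocs + blockSize - 1) / blockSize;
      // e.g., numDocs = 1000, blockSize = 250  ->  4 blocks (the original formula gives 5)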

    config.set("Ivory.CollectionName", targetEnv.readCollectionName()+"_"+srcEnv.readCollectionName());
    config.set("Ivory.IndexPath", targetLangDir);

    // collection size is the sum of the two collections' sizes
    int collSize = targetEnv.readCollectionDocumentCount() + srcEnv.readCollectionDocumentCount();
    config.setInt("Ivory.CollectionDocumentCount", collSize);

    ///////Parameters/////////////
    numOfBits = Integer.parseInt(args[2]);
    signatureType = args[3].toLowerCase();
View Full Code Here

    int batchSize = -1;
    try {
      if (batchSizeGiven) {
        batchSize = Integer.parseInt(args[3]);
        if (batchSize > 0) {
          int numDocs = env.readCollectionDocumentCount();
          numBatchFiles = numDocs / batchSize;
          if (numDocs % batchSize > 0) numBatchFiles++;
          System.out.println("numBatchFiles: " + numBatchFiles);
          config.setInt("NumBatch", numBatchFiles);
        }
View Full Code Here

      BuildTargetLangWeightedIntDocVectors weightedIntVectorsTool = new BuildTargetLangWeightedIntDocVectors(conf);
      LOG.info("Job BuildTargetLangWeightedIntDocVectors finished in "+(System.currentTimeMillis()-startTime)/1000.0+" seconds");

      int finalNumDocs = weightedIntVectorsTool.run();
      if(finalNumDocs > 0){
        LOG.info("Changed doc count from "+env.readCollectionDocumentCount() + " to = "+finalNumDocs);
        env.writeCollectionDocumentCount(finalNumDocs);
      }
      // Set Property.CollectionTermCount to the size of the target vocab, since all docs are translated into that vocab. This property is read by WriteRandomVectors via RunComputeSignatures.
      Vocab engVocabH = null;
      try {
View Full Code Here

    if (sampleDocnosFile != null && fs.exists(new Path(sampleDocnosFile))) {
      job.set("Ivory.SampleFile", sampleDocnosFile);
      DistributedCache.addCacheFile(new URI(sampleDocnosFile), job);
    } else if (sampleSize != -1) {
      RetrievalEnvironment env = new RetrievalEnvironment(workDir, fs);
      int collectionSize = env.readCollectionDocumentCount();
      sampleFreq = collectionSize / (float) sampleSize;
      job.setInt("SampleFrequency", (int) sampleFreq);
    } else {
      throw new RuntimeException("Either provide a sample size with " +
          "option -" + SAMPLESIZE_OPTION + " or existing sample docnos with option -" + SAMPLEDOCNOS_OPTION);
View Full Code Here
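
In the sampling branch above, the frequency is simply the ratio of collection size to requested sample size. A small worked example with made-up numbers (illustrative only):

      int collectionSize = 1000000;   // e.g., env.readCollectionDocumentCount()
      int sampleSize = 10000;         // desired number of sampled documents
      float sampleFreq = collectionSize / (float) sampleSize;   // 100.0f
      job.setInt("SampleFrequency", (int) sampleFreq);          // keep roughly 1 in 100 docs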

    PwsimEnvironment.setClassTypes(signatureType, config);
    int batchSize = -1;
    if (cmdline.hasOption(BATCH_OPTION)) {
      batchSize = Integer.parseInt(cmdline.getOptionValue(BATCH_OPTION));
      if (batchSize > 0) {
        int numDocs = env.readCollectionDocumentCount();
        numBatchFiles = numDocs / batchSize;
        if (numDocs % batchSize > 0) numBatchFiles++;
        System.out.println("numBatchFiles: " + numBatchFiles);
        config.setInt("NumBatch", numBatchFiles);
      }
View Full Code Here


    fScoreFn = (ScoringModel) new Bm25();
    fScoreFn.setAvgDocLength(lang2AvgSentLen.get(fLang));        

    // We use the df table of the English side, so we read the collection doc count from the English dir.
    RetrievalEnvironment eEnv = new RetrievalEnvironment(eDir, fs);
    fScoreFn.setDocCount(eEnv.readCollectionDocumentCount());  

    if (pathMapping.containsKey(modelFileName)) {
      classifier = new MoreGenericModelReader(pathMapping.get(modelFileName), localFs).constructModel();
      sLogger.info("Bitext classifier created successfully from " + pathMapping.get(modelFileName));
    }
View Full Code Here
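
In the last example, the document count read from the English index becomes N in the BM25 scoring model via setDocCount(). For orientation, a hedged sketch of one common BM25 idf variant; Ivory's Bm25 class may use a different formulation:

      // One common BM25 idf variant: idf(t) = ln((N - df + 0.5) / (df + 0.5)),
      // where N is the collection document count and df is the term's document frequency.
      static double idf(int numDocs, int df) {
        return Math.log((numDocs - df + 0.5) / (df + 0.5));
      }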
