Package ivory.core

Examples of ivory.core.RetrievalEnvironment.readCollectionDocumentCount()
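
readCollectionDocumentCount() returns the number of documents in the indexed collection, as recorded in the index directory when the index was built. A minimal, self-contained sketch of the pattern the examples below share: open a RetrievalEnvironment over an existing Ivory index and read the stored count. This sketch is illustrative only; the command-line argument and printed labels are assumptions, not part of the Ivory API.

      import org.apache.hadoop.conf.Configuration;
      import org.apache.hadoop.fs.FileSystem;
      import ivory.core.RetrievalEnvironment;

      public class ReadDocCount {
        public static void main(String[] args) throws Exception {
          // args[0] is assumed to be the path of an existing Ivory index directory.
          Configuration conf = new Configuration();
          FileSystem fs = FileSystem.get(conf);
          RetrievalEnvironment env = new RetrievalEnvironment(args[0], fs);
          // Reads the document count stored in the index directory.
          System.out.println("CollectionDocumentCount: " + env.readCollectionDocumentCount());
          System.out.println("CollectionLength: " + env.readCollectionLength());
        }
      }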


      sLogger.info("reading stats for " + index);

      RetrievalEnvironment env = new RetrievalEnvironment(index, fs);

      long l = env.readCollectionLength();
      int n = env.readCollectionDocumentCount();

      sLogger.info(" - CollectionLength: " + l);
      sLogger.info(" - CollectionDocumentCount: " + n);

      collectionLength += l;
View Full Code Here


    }
    LOG.info(mDLTable.getAvgDocLength() + " is average doc len.");
    LOG.info(mDLTable.getDocCount() + " is num docs.");

    conf.setFloat("Ivory.AvgDocLen", mDLTable.getAvgDocLength());
    conf.setInt("Ivory.CollectionDocumentCount", env.readCollectionDocumentCount());
   
    conf.setNumMapTasks(300);     
    conf.setNumReduceTasks(0);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInt("mapred.map.max.attempts", 10);
View Full Code Here
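
The snippet above passes the collection statistics to the MapReduce job through its configuration. As a hedged counterpart (not taken from the Ivory source), a mapper using the old mapred API could read the same properties back in configure():

      // Illustrative only: reading back the properties set above inside a task.
      // The property names match the snippet; the default values are placeholders.
      public void configure(JobConf conf) {
        float avgDocLen = conf.getFloat("Ivory.AvgDocLen", 0.0f);
        int numDocs = conf.getInt("Ivory.CollectionDocumentCount", 0);
        // ... use avgDocLen and numDocs, e.g., for score normalization ...
      }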

    FileSystem fs = FileSystem.get(job2);
   
    String indexPath = getConf().get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    int blockSize = getConf().getInt("Ivory.BlockSize", 0);   
    int numDocs = env.readCollectionDocumentCount();
    int numBlocks = numDocs / blockSize + 1;

    String inputPath = null;
    for (int i = 0; i < numBlocks; i++) {
      inputPath = conf.get("Ivory.PCPOutputPath") + "/block" + i;  // one block of output of the PCP algorithm
View Full Code Here
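
In the snippet above, numDocs / blockSize + 1 always covers the remainder, but it produces one extra (empty) block whenever numDocs is an exact multiple of blockSize. An exact ceiling-division variant, shown only as an illustrative sketch and assuming blockSize > 0, would be:

      // Smallest number of blocks of size blockSize that can hold numDocs documents.
      int numBlocks = (numDocs + blockSize - 1) / blockSize;
      // e.g., numDocs = 1000, blockSize = 250  ->  4 blocks (the original formula gives 5)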

    config.set("Ivory.CollectionName", targetEnv.readCollectionName()+"_"+srcEnv.readCollectionName());
    config.set("Ivory.IndexPath", targetLangDir);

    // collection size is the sum of the two collections' sizes
    int collSize = targetEnv.readCollectionDocumentCount() + srcEnv.readCollectionDocumentCount();
    config.setInt("Ivory.CollectionDocumentCount", collSize);

    ///////Parameters/////////////
    numOfBits = Integer.parseInt(args[2]);
    signatureType = args[3].toLowerCase();
View Full Code Here

    int batchSize = -1;
    try {
      if (batchSizeGiven) {
        batchSize = Integer.parseInt(args[3]);
        if (batchSize > 0) {
          int numDocs = env.readCollectionDocumentCount();
          numBatchFiles = numDocs / batchSize;
          if (numDocs % batchSize > 0) numBatchFiles++;
          System.out.println("numBatchFiles: " + numBatchFiles);
          config.setInt("NumBatch", numBatchFiles);
        }
View Full Code Here

      BuildTargetLangWeightedIntDocVectors weightedIntVectorsTool = new BuildTargetLangWeightedIntDocVectors(conf);
      LOG.info("Job BuildTargetLangWeightedIntDocVectors finished in "+(System.currentTimeMillis()-startTime)/1000.0+" seconds");

      int finalNumDocs = weightedIntVectorsTool.run();
      if(finalNumDocs > 0){
        LOG.info("Changed doc count from "+env.readCollectionDocumentCount() + " to = "+finalNumDocs);
        env.writeCollectionDocumentCount(finalNumDocs);
      }
      // Set Property.CollectionTermCount to the size of the target vocab, since all docs are translated into that vocab. This property is read by WriteRandomVectors via RunComputeSignatures.
      Vocab engVocabH = null;
      try {
View Full Code Here

    if (sampleDocnosFile != null && fs.exists(new Path(sampleDocnosFile))) {
      job.set("Ivory.SampleFile", sampleDocnosFile);
      DistributedCache.addCacheFile(new URI(sampleDocnosFile), job);
    } else if (sampleSize != -1) {
      RetrievalEnvironment env = new RetrievalEnvironment(workDir, fs);
      int collectionSize = env.readCollectionDocumentCount();
      sampleFreq = collectionSize / (float) sampleSize;
      job.setInt("SampleFrequency", (int) sampleFreq);
    } else {
      throw new RuntimeException("Either provide a sample size with " +
          "option -" + SAMPLESIZE_OPTION + " or existing sample docnos with option -" + SAMPLEDOCNOS_OPTION);
View Full Code Here
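
In the sampling branch above, the frequency is simply the ratio of collection size to requested sample size. A small worked example with made-up numbers (illustrative only):

      int collectionSize = 1000000;   // e.g., env.readCollectionDocumentCount()
      int sampleSize = 10000;         // desired number of sampled documents
      float sampleFreq = collectionSize / (float) sampleSize;   // 100.0f
      job.setInt("SampleFrequency", (int) sampleFreq);          // keep roughly 1 in 100 docs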

    PwsimEnvironment.setClassTypes(signatureType, config);
    int batchSize = -1;
    if (cmdline.hasOption(BATCH_OPTION)) {
      batchSize = Integer.parseInt(cmdline.getOptionValue(BATCH_OPTION));
      if (batchSize > 0) {
        int numDocs = env.readCollectionDocumentCount();
        numBatchFiles = numDocs / batchSize;
        if (numDocs % batchSize > 0) numBatchFiles++;
        System.out.println("numBatchFiles: " + numBatchFiles);
        config.setInt("NumBatch", numBatchFiles);
      }
View Full Code Here


    fScoreFn = (ScoringModel) new Bm25();
    fScoreFn.setAvgDocLength(lang2AvgSentLen.get(fLang));        

    // We use the df table of the English side, so we read the collection doc count from the English dir.
    RetrievalEnvironment eEnv = new RetrievalEnvironment(eDir, fs);
    fScoreFn.setDocCount(eEnv.readCollectionDocumentCount());  

    if (pathMapping.containsKey(modelFileName)) {
      classifier = new MoreGenericModelReader(pathMapping.get(modelFileName), localFs).constructModel();
      sLogger.info("Bitext classifier created successfully from " + pathMapping.get(modelFileName));
    }
View Full Code Here
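
In the last example, the document count read from the English index becomes N in the BM25 scoring model via setDocCount(). For orientation, a hedged sketch of one common BM25 idf variant; Ivory's Bm25 class may use a different formulation:

      // One common BM25 idf variant: idf(t) = ln((N - df + 0.5) / (df + 0.5)),
      // where N is the collection document count and df is the term's document frequency.
      static double idf(int numDocs, int df) {
        return Math.log((numDocs - df + 0.5) / (df + 0.5));
      }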
