Examples of ivory.core.RetrievalEnvironment.readCollectionDocumentCount()

ivory.core.RetrievalEnvironment.readCollectionDocumentCount()

    eTok = TokenizerFactory.createTokenizer(fs, eLang, tokenizerFile, true, conf.get("eStopword"), conf.get("eStemmedStopword"), null);
    sLogger.info("Tokenizer and vocabs created successfully.");


    eScoreFn = (ScoringModel) new Bm25();
    eScoreFn.setAvgDocLength(lang2AvgSentLen.get(eLang));        //average sentence length = heuristic based on De-En data
    eScoreFn.setDocCount(env.readCollectionDocumentCount());


    dict = new DefaultFrequencySortedDictionary(new Path(env.getIndexTermsData()), new Path(env.getIndexTermIdsData()), new Path(env.getIndexTermIdMappingData()), fs);
    dfTable = new DfTableArray(new Path(env.getDfByTermData()), fs);
  }

View Full Code Here

    fScoreFn = (ScoringModel) new Bm25();
    fScoreFn.setAvgDocLength(lang2AvgSentLen.get(fLang));         


    // we use df table of English side, so we should read collection doc count from English dir
    RetrievalEnvironment eEnv = new RetrievalEnvironment(eDir, localFs);
    fScoreFn.setDocCount(eEnv.readCollectionDocumentCount());   


    classifier = new MoreGenericModelReader(new Path(conf.get("modelFileName")), localFs).constructModel();
  }


  private void loadEModels(Configuration conf) throws Exception {

View Full Code Here

    eTok = TokenizerFactory.createTokenizer(localFs, eLang, tokenizerFile, true, conf.get("eStopword"), null, null);
    sLogger.info("Tokenizer and vocabs created successfully.");


    eScoreFn = (ScoringModel) new Bm25();
    eScoreFn.setAvgDocLength(lang2AvgSentLen.get(eLang));        //average sentence length = heuristic based on De-En data
    eScoreFn.setDocCount(env.readCollectionDocumentCount());


    dict = new DefaultFrequencySortedDictionary(new Path(env.getIndexTermsData()), new Path(env.getIndexTermIdsData()), new Path(env.getIndexTermIdMappingData()), localFs);
    dfTable = new DfTableArray(new Path(env.getDfByTermData()), localFs);
  }

View Full Code Here

    PwsimEnvironment.setClassTypes(config);
    int batchSize = -1;
    if (cmdline.hasOption(BATCH_OPTION)) {
      batchSize = Integer.parseInt(cmdline.getOptionValue(BATCH_OPTION));
      if (batchSize > 0) {
        int numDocs = env.readCollectionDocumentCount();
        numBatchFiles = numDocs / batchSize;
        if(numDocs % batchSize > 0) numBatchFiles++;
        System.out.println("numBatchFiles: "+numBatchFiles);
        config.setInt("NumBatch", numBatchFiles);
      }

View Full Code Here

      int finalNumDocs = weightedIntVectorsTool.run();


      LOG.info("Job BuildTargetLangWeightedIntDocVectors finished in " +
          (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
      if (finalNumDocs > 0) {
        LOG.info("Changed doc count: " + env.readCollectionDocumentCount() +" => " + finalNumDocs);
        env.writeCollectionDocumentCount(finalNumDocs);
      }else {
        LOG.info("No document output! Terminating...");
        return -1;
      }

View Full Code Here

    FileSystem fs = FileSystem.get(getConf());


    RetrievalEnvironment re = new RetrievalEnvironment(indexPath, fs);


    String collectionName = re.readCollectionName();
    int numDocs = re.readCollectionDocumentCount();
    Path docLengthPath = re.getDoclengthsData();
    String scoringModel = getConf().get("Ivory.ScoringModel");


    sLogger.info("Characteristics of the collection:");
    sLogger.info(" - CollectionName: " + collectionName);

View Full Code Here

      mDLTable = new DocLengthTable4B(env.getDoclengthsData(), fs);
    } catch (IOException e1) {
      throw new RuntimeException("Error initializing Doclengths file");
    }
    LOG.info(mDLTable.getAvgDocLength()+" is average source-language document length.");
    LOG.info(targetEnv.readCollectionDocumentCount()+" is number of target-language docs. We use the target-side DF table so we set #docs to this value in our scoring model.");


    /////// Configuration setup


    conf.set(Constants.IndexPath, indexPath);
    conf.set("Ivory.ScoringModel", scoringModel);

View Full Code Here

    /////// Configuration setup


    conf.set(Constants.IndexPath, indexPath);
    conf.set("Ivory.ScoringModel", scoringModel);
    conf.setFloat("Ivory.AvgDocLen", mDLTable.getAvgDocLength());
    conf.setInt(Constants.CollectionDocumentCount, targetEnv.readCollectionDocumentCount());
    conf.set(Constants.Language, getConf().get("Ivory.Lang"));
    conf.set("Ivory.Normalize", getConf().get("Ivory.Normalize"));
    conf.set("Ivory.MinNumTerms", getConf().get("Ivory.MinNumTerms"));


    conf.setNumMapTasks(300);

View Full Code Here


    String collectionName = env.readCollectionName();


    int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);
    int minSplitSize = conf.getInt(Constants.MinSplitSize, 0);
    int collectionDocCnt = env.readCollectionDocumentCount();
    //int maxHeap = conf.getInt(Constants.MaxHeap, 2048);


    String postingsType = conf.get(Constants.PostingsListsType,
        PostingsListDocSortedPositional.class.getCanonicalName());
    @SuppressWarnings("unchecked")

View Full Code Here

      int finalNumDocs = weightedIntVectorsTool.run();


      LOG.info("Job BuildTargetLangWeightedIntDocVectors finished in " +
          (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
      if (finalNumDocs > 0) {
        LOG.info("Changed doc count: " + env.readCollectionDocumentCount() +" => " + finalNumDocs);
        env.writeCollectionDocumentCount(finalNumDocs);
      }else {
        LOG.info("No document output! Terminating...");
        return -1;
      }

View Full Code Here

0 1 2 3 4 5 6

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.