Package ivory.core

Examples of ivory.core.RetrievalEnvironment
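RetrievalEnvironment is Ivory's handle on an index directory stored on a Hadoop FileSystem: it hands out the paths of the index artifacts (docno mapping data, doc vectors, forward indexes) and reads and writes collection-level statistics. A minimal sketch of the pattern the examples below share, using only methods that appear on this page (the index path is a placeholder):

    // Open an existing index and read its global statistics.
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    RetrievalEnvironment env = new RetrievalEnvironment("/path/to/index", fs);
    env.initialize(true); // load collection statistics from the index

    System.out.println("docs:  " + env.readCollectionDocumentCount());
    System.out.println("terms: " + env.readCollectionTermCount());

The snippets below, collected from the Ivory sources, show the class in context; the first configures preprocessing for a TREC collection.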


    Configuration conf = parseArgs(args);
    FileSystem fs = FileSystem.get(conf);
    String indexRootPath = conf.get(Constants.IndexPath);
    String collection = conf.get(Constants.CollectionPath);

    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);
    Path mappingFile = env.getDocnoMappingData();
    new TrecDocnoMappingBuilder().build(new Path(collection), mappingFile, conf);

    conf.set(Constants.DocnoMappingClass, TrecDocnoMapping.class.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());
    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.MinDf, 2); // toss away singleton terms
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.TermIndexWindow, 8);
    conf.set(Constants.InputFormat, TrecDocumentInputFormat.class.getCanonicalName());


  private HashMap<String, Accumulator[]> results;
  private DocnoMapping docnoMapping;

  public StructuredQueryRanker(String indexPath, FileSystem fs, int numResults)
      throws IOException, ConfigurationException {
    this.env = new RetrievalEnvironment(indexPath, fs);
    env.initialize(true);

    this.numResults = numResults;
    results = new HashMap<String, Accumulator[]>();
    docnoMapping = getDocnoMapping();
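Once ranking completes, the docnoMapping loaded above is what turns internal docnos in the results map back into external collection docids. A hedged sketch of that step; DocnoMapping.getDocid() and the public docno/score fields on Accumulator are assumptions for illustration, not taken from the snippets on this page:

    // Sketch: emit one "qid docid score" line per retrieved document.
    for (String qid : results.keySet()) {
      for (Accumulator a : results.get(qid)) {
        // getDocid() and the docno/score fields are assumed here.
        System.out.println(qid + " " + docnoMapping.getDocid(a.docno) + " " + a.score);
      }
    }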

    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
      LOG.info("Index path doesn't exist, creating...");
      fs.mkdirs(p);
    }
    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    // Build docno mapping from raw collection
    Path mappingFile = env.getDocnoMappingData();
    if (!fs.exists(mappingFile)) {
      LOG.info(mappingFile + " doesn't exist, creating...");
      String[] arr = new String[] {
          "-input=" + rawCollection,
          "-output_path=" + indexRootPath + "/wiki-docid-tmp",
          "-output_file=" + mappingFile.toString() };
      LOG.info("Running BuildWikipediaDocnoMapping with args " + Arrays.toString(arr));

      BuildWikipediaDocnoMapping tool = new BuildWikipediaDocnoMapping();
      tool.setConf(conf);
      tool.run(arr);

      fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true);
    }

    // Repack Wikipedia into a block-compressed SequenceFile.
    p = new Path(seqCollection);
    if (!fs.exists(p)) {
      LOG.info(seqCollection + " doesn't exist, creating...");
      String[] arr = new String[] { "-input=" + rawCollection,
          "-output=" + seqCollection,
          "-mapping_file=" + mappingFile.toString(),
          "-compression_type=block",
          "-wiki_language=" + collectionLang };
      LOG.info("Running RepackWikipedia with args " + Arrays.toString(arr));

      RepackWikipedia tool = new RepackWikipedia();
      tool.setConf(conf);
      tool.run(arr);
    }

    conf.set(Constants.CollectionName, "Wikipedia-" + collectionLang);
    conf.setInt(Constants.NumMapTasks, numMappers);
    conf.setInt(Constants.NumReduceTasks, numReducers);
    conf.set(Constants.CollectionPath, seqCollection);
    conf.set(Constants.IndexPath, indexRootPath);
    conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, WikipediaDocnoMapping.class.getCanonicalName());
    conf.set(Constants.Tokenizer, tokenizerClass); // e.g., "ivory.tokenize.OpenNLPTokenizer"
    conf.setInt(Constants.MinDf, MinDF);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.TermIndexWindow, TermIndexWindow);

    // Build term doc vectors from the document collection, filtering out terms
    // that are not in Ivory.SrcVocab.
    long startTime = System.currentTimeMillis();
    long preprocessStartTime = System.currentTimeMillis();
    LOG.info("Building term doc vectors...");
    int exitCode = new BuildTermDocVectors(conf).run();
    if (exitCode >= 0) {
      LOG.info("Job BuildTermDocVectors finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    } else {
      LOG.error("Error: BuildTermDocVectors. Terminating...");
      return -1;
    }

    // Get CF and DF counts.
    startTime = System.currentTimeMillis();
    LOG.info("Counting terms...");
    exitCode = new ComputeGlobalTermStatistics(conf).run();
    if (exitCode >= 0) {
      LOG.info("TermCount = " + env.readCollectionTermCount());
      LOG.info("Job ComputeGlobalTermStatistics finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    } else {
      LOG.error("Error: ComputeGlobalTermStatistics. Terminating...");
      return -1;
    }
    // Build a map from terms to sequentially generated integer term ids.
    startTime = System.currentTimeMillis();
    LOG.info("Building term-to-integer id mapping...");
    exitCode = new BuildDictionary(conf).run();
    if (exitCode >= 0) {
      LOG.info("Job BuildDictionary finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    } else {
      LOG.error("Error: BuildDictionary. Terminating...");
      return -1;
    }

    // Compute term weights, and output weighted term doc vectors.
    LOG.info("Building weighted term doc vectors...");
    startTime = System.currentTimeMillis();

    conf.set("Ivory.ScoringModel", "ivory.pwsim.score.Bm25");
    conf.setBoolean("Ivory.Normalize", IsNormalized);
    conf.setInt("Ivory.MinNumTerms",MinNumTermsPerArticle);

    if (mode == CROSS_LINGUAL_F) {
      // Translate term doc vectors into English.
      exitCode = new BuildTranslatedTermDocVectors(conf).run();
    } else {
      // Build weighted term doc vectors.
      exitCode = new BuildWeightedTermDocVectors(conf).run();
    }
    if (exitCode >= 0) {
      LOG.info("Job BuildTranslated/WeightedTermDocVectors finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    } else {
      LOG.error("Error: BuildTranslated/WeightedTermDocVectors. Terminating...");
      return -1;
    }

    // Normalize (optional) and convert weighted term doc vectors into int doc vectors for efficiency.
    startTime = System.currentTimeMillis();
    LOG.info("Building weighted integer doc vectors...");
    conf.setBoolean("Ivory.Normalize", IsNormalized);
    if (mode == MONO_LINGUAL) {
      exitCode = new BuildIntDocVectors(conf).run();
      if (exitCode < 0) {
        LOG.error("Error: BuildIntDocVectors. Terminating...");
        return -1;
      }
      exitCode = new BuildWeightedIntDocVectors(conf).run();
      if (exitCode >= 0) {
        LOG.info("Job BuildWeightedIntDocVectors finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
      } else {
        LOG.error("Error: BuildWeightedIntDocVectors. Terminating...");
        return -1;
      }
    } else {
      BuildTargetLangWeightedIntDocVectors weightedIntVectorsTool =
        new BuildTargetLangWeightedIntDocVectors(conf);

      int finalNumDocs = weightedIntVectorsTool.run();

      LOG.info("Job BuildTargetLangWeightedIntDocVectors finished in " +
          (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
      if (finalNumDocs > 0) {
        LOG.info("Changed doc count: " + env.readCollectionDocumentCount() + " => " + finalNumDocs);
        env.writeCollectionDocumentCount(finalNumDocs);
      } else {
        LOG.error("No document output! Terminating...");
        return -1;
      }

      // Set Property.CollectionTermCount to the size of the target vocab, since all docs are
      // translated into that vocab. This property is read by WriteRandomVectors via
      // RunComputeSignatures.
      Vocab engVocabH = null;
      try {
        engVocabH = HadoopAlign.loadVocab(new Path(conf.get("Ivory.FinalVocab")), conf);
      } catch (IOException e) {
        e.printStackTrace();
        return -1; // cannot proceed without the final vocabulary
      }
      LOG.info("Changed term count: " + env.readCollectionTermCount() + " => " + engVocabH.size());
      env.writeCollectionTermCount(engVocabH.size());
    }

    LOG.info("Preprocessing job finished in " + (System.currentTimeMillis() - preprocessStartTime) / 1000.0 + " seconds");

    return 0;
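This driver shows the larger pattern: each preprocessing stage (BuildTermDocVectors, ComputeGlobalTermStatistics, BuildDictionary, then the weighted and integer doc vector builders) is a separate Hadoop job, chained through exit codes, with RetrievalEnvironment used both to locate intermediate outputs and to persist collection statistics between jobs.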

  public IntDocVectorsForwardIndex(String indexPath, FileSystem fs, boolean weighted)
      throws IOException {
    this.fs = Preconditions.checkNotNull(fs);
    this.conf = fs.getConf();

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    path = (weighted ? env.getWeightedIntDocVectorsDirectory() : env.getIntDocVectorsDirectory());

    String forwardIndexPath = (weighted ? env.getWeightedIntDocVectorsForwardIndex()
        : env.getIntDocVectorsForwardIndex());
    FSDataInputStream posInput = fs.open(new Path(forwardIndexPath));

    docnoOffset = posInput.readInt();
    collectionDocumentCount = posInput.readInt();

  public TermDocVectorsForwardIndex(String indexPath, FileSystem fs) throws IOException {
    Preconditions.checkNotNull(indexPath);
    this.fs = Preconditions.checkNotNull(fs);
    conf = fs.getConf();

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    path = env.getTermDocVectorsDirectory();

    FSDataInputStream posInput = fs.open(new Path(env.getTermDocVectorsForwardIndex()));

    docnoOffset = posInput.readInt();
    collectionDocumentCount = posInput.readInt();

    positions = new long[collectionDocumentCount];
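Both forward-index constructors read the same header: the docno offset, then the collection document count, followed by one byte position per document. A minimal sketch of how such a positions array is filled and used (an illustration of the layout, not the verbatim Ivory implementation):

    // Read the per-document positions that follow the two header ints.
    for (int i = 0; i < collectionDocumentCount; i++) {
      positions[i] = posInput.readLong();
    }
    posInput.close();

    // A docno (docnos start at docnoOffset + 1) then maps to a byte offset
    // into the doc vector data: positions[docno - docnoOffset - 1].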

    LOG.info(String.format(" - %s: %s", Constants.DocnoOffset, docnoOffset));
    LOG.info(String.format(" - %s: %s", Constants.TermDocVectorSegments, numReducers));
    LOG.info(String.format(" - %s: %s", Constants.CollectionVocab, conf.get(Constants.CollectionVocab)));
    LOG.info(String.format(" - %s: %s", Constants.StopwordList, conf.get(Constants.StopwordList)));

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    Path mappingFile = env.getDocnoMappingData();

    if (!fs.exists(mappingFile)) {
      LOG.error("Error, docno mapping data file " + mappingFile + " doesn't exist!");
      return -1;
    }

    DistributedCache.addCacheFile(mappingFile.toUri(), conf);

    Path outputPath = new Path(env.getTermDocVectorsDirectory());
    if (fs.exists(outputPath)) {
      LOG.info("TermDocVectors already exist: Skipping!");
      return 0;
    }

    env.writeCollectionName(collectionName);
    env.writeCollectionPath(collectionPath);
    env.writeInputFormat(inputFormat);
    env.writeDocnoMappingClass(mappingClass);
    env.writeTokenizerClass(tokenizer);
    env.writeDocnoOffset(docnoOffset);

    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.set("mapred.task.timeout", "6000000");      // needed for stragglers (e.g., very long documents in Wikipedia)

    Job job1 = new Job(conf,
        BuildTermDocVectors.class.getSimpleName() + ":" + collectionName);
    job1.setJarByClass(BuildTermDocVectors.class);

    job1.setNumReduceTasks(numReducers);

    FileInputFormat.addInputPaths(job1, collectionPath);
    FileOutputFormat.setOutputPath(job1, outputPath);
    SequenceFileOutputFormat.setOutputCompressionType(job1, SequenceFile.CompressionType.RECORD);

    job1.setInputFormatClass((Class<? extends InputFormat>) Class.forName(inputFormat));
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);

    job1.setMapOutputKeyClass(IntWritable.class);
    job1.setMapOutputValueClass(LazyTermDocVector.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(LazyTermDocVector.class);

    job1.setMapperClass(MyMapper.class);

    long startTime = System.currentTimeMillis();
    job1.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // Write out the number of documents in the collection.
    int collectionDocCount = (int) job1.getCounters().findCounter(Docs.Total).getValue();
    env.writeCollectionDocumentCount(collectionDocCount);

    Path dlFile = env.getDoclengthsData();
    if (fs.exists(dlFile)) {
      LOG.info("DocLength data exists: Skipping!");
      return 0;
    }

    conf.setInt(Constants.CollectionDocumentCount, collectionDocCount);
    conf.set(InputPath, env.getDoclengthsDirectory().toString());
    conf.set(DocLengthDataFile, dlFile.toString());

    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    LOG.info("Writing doc length data to " + dlFile + "...");

    Job job2 = new Job(conf, "DocLengthTable:" + collectionName);
    job2.setJarByClass(BuildTermDocVectors.class);

    job2.setNumReduceTasks(0);
    job2.setInputFormatClass(NullInputFormat.class);
    job2.setOutputFormatClass(NullOutputFormat.class);
    job2.setMapperClass(DocLengthDataWriterMapper.class);

    startTime = System.currentTimeMillis();
    job2.waitForCompletion(true);
    LOG.info("Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    long collectionSumOfDocLengths =
        job2.getCounters().findCounter(DocLengths.SumOfDocLengths).getValue();
    env.writeCollectionAverageDocumentLength(
        (float) collectionSumOfDocLengths / collectionDocCount);

    return 0;
  }
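Note the pattern in this runner: per-document statistics are aggregated with Hadoop counters (Docs.Total, DocLengths.SumOfDocLengths) rather than a separate counting pass, then persisted into the index via env.writeCollectionDocumentCount() and env.writeCollectionAverageDocumentLength(), so later stages that receive only the index path can read them back through the same RetrievalEnvironment.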

      getPairsInSCFG(conf.get(Constants.SCFGPath));
    }

    // initialize environment to access index
    try {
      env = new RetrievalEnvironment(conf.get(Constants.IndexPath), fs);
      env.initialize(true);
    } catch (ConfigurationException e) {
      e.printStackTrace();
    }
  }

    } else {
      LOG.info("Index directory " + p + " already exists!");
      return -1;
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);
    Path mappingFile = env.getDocnoMappingData();
    new TrecWebDocnoMappingBuilder().build(new Path(collection), mappingFile, conf);

    conf.set(Constants.CollectionName, "Wt10g");
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexRootPath);

        docMapping =
          (DocnoMapping) Class.forName(conf.get(Constants.DocnoMappingClass)).newInstance();

        // Take a different code path if we're in standalone mode.
        if (conf.get("mapred.job.tracker").equals("local")) {
          RetrievalEnvironment env = new RetrievalEnvironment(
              context.getConfiguration().get(Constants.IndexPath), localFs);
          docMapping.loadMapping(env.getDocnoMappingData(), localFs);
        } else {
          Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);
          // Load the docid to docno mappings. Assume file 0.
          docMapping.loadMapping(localFiles[0], localFs);
        }

    }
    LOG.info("Bigram segmentation = " + bigramSegment);

    // initialize environment to access index
    try {
      env = new RetrievalEnvironment(conf.get(Constants.IndexPath), fs);
      env.initialize(true);
    } catch (ConfigurationException e) {
      e.printStackTrace();
    }
