Package edu.umd.cloud9.util.map

Examples of edu.umd.cloud9.util.map.HMapIF$Values


  public DocnoMapping getDocnoMapping() throws IOException {
    return loadDocnoMapping(indexPath, fs);
  }

  public static DocnoMapping loadDocnoMapping(String indexPath, FileSystem fs) throws IOException {
    DocnoMapping mDocMapping = null;
    // load the docid to docno mappings
    try {
      LOG.info("Loading DocnoMapping file...");
      RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

      String className = env.readDocnoMappingClass();
      LOG.info(" - Class name: " + className);
      mDocMapping = (DocnoMapping) Class.forName(className).newInstance();

      Path mappingFile = env.getDocnoMappingData();
      LOG.info(" - File name: " + mappingFile);
      mDocMapping.loadMapping(mappingFile, fs);
      LOG.info("Done!");
    } catch (Exception e) {
      throw new IOException("Error initializing DocnoMapping!");
    }
    return mDocMapping;
View Full Code Here


      return -1;
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    Path mappingFile = env.getDocnoMappingData();
    new ClueWarcDocnoMappingBuilder().build(new Path(collection), mappingFile, conf);

    conf.set(Constants.CollectionName, "ClueWeb:English:Segment" + segment);
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexPath);
    conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
View Full Code Here

    // (sequentially-numbered integer). If it doesn't exist, create it.
    Path mappingFile = env.getDocnoMappingData();
    if (!fs.exists(mappingFile)) {
      sLogger.info(mappingFile + " doesn't exist, creating...");
      String[] arr = new String[] { collection, indexPath + "/medline-docid-tmp",  mappingFile.toString(), new Integer(numMappers).toString() };
      NumberMedlineCitations tool = new NumberMedlineCitations();
      tool.setConf(conf);
      tool.run(arr);

      fs.delete(new Path(indexPath + "/medline-docid-tmp"), true);
    }

    // Now we're ready to start the preprocessing pipeline... set
View Full Code Here

    if (!fs.exists(mappingFile)) {
      LOG.info("docno-mapping.dat doesn't exist, creating...");
      String[] arr = new String[] { collection, mappingDir.toString(),
              mappingFile.toString() };
      NumberTrecDocuments2 tool = new NumberTrecDocuments2();
      tool.setConf(conf);
      tool.run(arr);

      fs.delete(mappingDir, true);
    }

    conf.set(Constants.CollectionName, "TREC_vol45");
View Full Code Here

    if (!fs.exists(mappingFile)) {
      LOG.info("docno-mapping.dat doesn't exist, creating...");
      String[] arr = new String[] { collection, mappingDir.toString(),
          mappingFile.toString(), "100" };
      NumberTrecWebDocuments tool = new NumberTrecWebDocuments();
      tool.setConf(conf);
      tool.run(arr);

      fs.delete(mappingDir, true);
    }

    conf.set(Constants.CollectionName, "Wt10g");
View Full Code Here

          "-output_path=" + indexRootPath + "/wiki-docid-tmp",
          "-output_file=" + mappingFile.toString(),
          "-lang=" + collectionLang };
      LOG.info("Running BuildWikipediaDocnoMapping with args " + Arrays.toString(arr));

      BuildWikipediaDocnoMapping tool = new BuildWikipediaDocnoMapping();
      tool.setConf(conf);
      tool.run(arr);

      fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true);
    }

    // Repack Wikipedia into sequential compressed block
    p = new Path(seqCollection);
    LOG.info(seqCollection + " doesn't exist, creating...");
    String[] arr = new String[] { "-input=" + rawCollection,
        "-output=" + seqCollection,
        "-mapping_file=" + mappingFile.toString(),
        "-compression_type=block",
        "-wiki_language=" + collectionLang };
    LOG.info("Running RepackWikipedia with args " + Arrays.toString(arr));

    RepackWikipedia tool = new RepackWikipedia();
    tool.setConf(conf);
    tool.run(arr);

    conf.set(Constants.CollectionName, "Wikipedia-"+collectionLang);
    conf.setInt(Constants.NumMapTasks, numMappers);
    conf.setInt(Constants.NumReduceTasks, numReducers);
    conf.set(Constants.CollectionPath, seqCollection);
View Full Code Here

        "-mapping_file=" + mappingFile.toString(),
        "-compression_type=block",
        "-wiki_language=" + collectionLang };
    LOG.info("Running RepackWikipedia with args " + Arrays.toString(arr));

    RepackWikipedia tool = new RepackWikipedia();
    tool.setConf(conf);
    tool.run(arr);

    conf.set(Constants.CollectionName, "Wikipedia-"+collectionLang);
    conf.setInt(Constants.NumMapTasks, numMappers);
    conf.setInt(Constants.NumReduceTasks, numReducers);
    conf.set(Constants.CollectionPath, seqCollection);
View Full Code Here

   *     FileSystem object
   * @return
   *     mapping from term ids to df values
   */
  public static HMapIFW readTransDfTable(Path path, FileSystem fs) {
    HMapIFW transDfTable = new HMapIFW();
    try {
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());

      IntWritable key = (IntWritable) reader.getKeyClass().newInstance();
      FloatWritable value = (FloatWritable) reader.getValueClass().newInstance();

      while (reader.next(key, value)) {
        transDfTable.put(key.get(), value.get());
        //        logger.info(key.get()+"-->"+value.get());
        key = (IntWritable) reader.getKeyClass().newInstance();
        value = (FloatWritable) reader.getValueClass().newInstance();
      }
      reader.close();
View Full Code Here

   *     ttable E-->F (i.e., Pr(f|e))
   * @return
   *     mapping from E-terms to their computed df values
   */
  public static HMapIFW translateDFTable(Vocab eVocabSrc, Vocab fVocabTrg, TTable_monolithic_IFAs e2f_probs, FrequencySortedDictionary dict, DfTableArray dfTable){
    HMapIFW transDfTable = new HMapIFW();
    for(int e=1;e<eVocabSrc.size();e++){
      int[] fS = e2f_probs.get(e).getTranslations(0.0f);
      float df=0;
      for(int f : fS){
        float probEF = e2f_probs.get(e, f);
        String fTerm = fVocabTrg.get(f);
        int id = dict.getId(fTerm);
        if(id != -1){
          float df_f = dfTable.getDf(id);       
          df += (probEF*df_f);
        }else{
          logger.debug(fTerm+" not in dict");
        }
      }
      transDfTable.put(e, df);
    }
    return transDfTable;
  }
View Full Code Here

   *     mapping from F-terms to their df values
   * @return
   *     mapping from E-terms to their computed df values
   */
  public static HMapIFW translateDFTable(Vocab eVocabSrc, Vocab fVocabTrg, TTable_monolithic_IFAs e2f_probs, HMapSIW dfs){
    HMapIFW transDfTable = new HMapIFW();
    for(int e=1;e<eVocabSrc.size();e++){
      int[] fS = null;
      try {
        fS = e2f_probs.get(e).getTranslations(0.0f);
      } catch (Exception e1) {
        e1.printStackTrace();
      }
      float df=0;
      for(int f : fS){
        float probEF = e2f_probs.get(e, f);
        String fTerm = fVocabTrg.get(f);
        if(!dfs.containsKey(fTerm)){  //only if word is in the collection, can it contribute to the df values.
          continue;
        }     
        float df_f = dfs.get(fTerm);
        df+=(probEF*df_f);
      }
      transDfTable.put(e, df);
    }
    return transDfTable;
  }
View Full Code Here

TOP

Related Classes of edu.umd.cloud9.util.map.HMapIF$Values

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle Corporation. Contact coftware#gmail.com.