Package edu.umd.cloud9.util.map

Examples of edu.umd.cloud9.util.map.HMapII


  public DocnoMapping getDocnoMapping() throws IOException {
    return loadDocnoMapping(indexPath, fs);
  }

  public static DocnoMapping loadDocnoMapping(String indexPath, FileSystem fs) throws IOException {
    DocnoMapping mDocMapping = null;
    // load the docid to docno mappings
    try {
      LOG.info("Loading DocnoMapping file...");
      RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

      String className = env.readDocnoMappingClass();
      LOG.info(" - Class name: " + className);
      mDocMapping = (DocnoMapping) Class.forName(className).newInstance();

      Path mappingFile = env.getDocnoMappingData();
      LOG.info(" - File name: " + mappingFile);
      mDocMapping.loadMapping(mappingFile, fs);
      LOG.info("Done!");
    } catch (Exception e) {
      throw new IOException("Error initializing DocnoMapping!");
    }
    return mDocMapping;
View Full Code Here


      return -1;
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    Path mappingFile = env.getDocnoMappingData();
    new ClueWarcDocnoMappingBuilder().build(new Path(collection), mappingFile, conf);

    conf.set(Constants.CollectionName, "ClueWeb:English:Segment" + segment);
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexPath);
    conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
View Full Code Here

          "-output_path=" + indexRootPath + "/wiki-docid-tmp",
          "-output_file=" + mappingFile.toString(),
          "-lang=" + collectionLang };
      LOG.info("Running BuildWikipediaDocnoMapping with args " + Arrays.toString(arr));

      BuildWikipediaDocnoMapping tool = new BuildWikipediaDocnoMapping();
      tool.setConf(conf);
      tool.run(arr);

      fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true);
    }

    // Repack Wikipedia into sequential compressed block
    p = new Path(seqCollection);
    LOG.info(seqCollection + " doesn't exist, creating...");
    String[] arr = new String[] { "-input=" + rawCollection,
        "-output=" + seqCollection,
        "-mapping_file=" + mappingFile.toString(),
        "-compression_type=block",
        "-wiki_language=" + collectionLang };
    LOG.info("Running RepackWikipedia with args " + Arrays.toString(arr));

    RepackWikipedia tool = new RepackWikipedia();
    tool.setConf(conf);
    tool.run(arr);

    conf.set(Constants.CollectionName, "Wikipedia-"+collectionLang);
    conf.setInt(Constants.NumMapTasks, numMappers);
    conf.setInt(Constants.NumReduceTasks, numReducers);
    conf.set(Constants.CollectionPath, seqCollection);
View Full Code Here

        "-mapping_file=" + mappingFile.toString(),
        "-compression_type=block",
        "-wiki_language=" + collectionLang };
    LOG.info("Running RepackWikipedia with args " + Arrays.toString(arr));

    RepackWikipedia tool = new RepackWikipedia();
    tool.setConf(conf);
    tool.run(arr);

    conf.set(Constants.CollectionName, "Wikipedia-"+collectionLang);
    conf.setInt(Constants.NumMapTasks, numMappers);
    conf.setInt(Constants.NumReduceTasks, numReducers);
    conf.set(Constants.CollectionPath, seqCollection);
View Full Code Here

   *     FileSystem object
   * @return
   *     mapping from term ids to df values
   */
  public static HMapIFW readTransDfTable(Path path, FileSystem fs) {
    HMapIFW transDfTable = new HMapIFW();
    try {
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());

      IntWritable key = (IntWritable) reader.getKeyClass().newInstance();
      FloatWritable value = (FloatWritable) reader.getValueClass().newInstance();

      while (reader.next(key, value)) {
        transDfTable.put(key.get(), value.get());
        //        logger.info(key.get()+"-->"+value.get());
        key = (IntWritable) reader.getKeyClass().newInstance();
        value = (FloatWritable) reader.getValueClass().newInstance();
      }
      reader.close();
View Full Code Here

   *     ttable E-->F (i.e., Pr(f|e))
   * @return
   *     mapping from E-terms to their computed df values
   */
  public static HMapIFW translateDFTable(Vocab eVocabSrc, Vocab fVocabTrg, TTable_monolithic_IFAs e2f_probs, FrequencySortedDictionary dict, DfTableArray dfTable){
    HMapIFW transDfTable = new HMapIFW();
    for(int e=1;e<eVocabSrc.size();e++){
      int[] fS = e2f_probs.get(e).getTranslations(0.0f);
      float df=0;
      for(int f : fS){
        float probEF = e2f_probs.get(e, f);
        String fTerm = fVocabTrg.get(f);
        int id = dict.getId(fTerm);
        if(id != -1){
          float df_f = dfTable.getDf(id);       
          df += (probEF*df_f);
        }else{
          logger.debug(fTerm+" not in dict");
        }
      }
      transDfTable.put(e, df);
    }
    return transDfTable;
  }
View Full Code Here

   *     mapping from F-terms to their df values
   * @return
   *     mapping from E-terms to their computed df values
   */
  public static HMapIFW translateDFTable(Vocab eVocabSrc, Vocab fVocabTrg, TTable_monolithic_IFAs e2f_probs, HMapSIW dfs){
    HMapIFW transDfTable = new HMapIFW();
    for(int e=1;e<eVocabSrc.size();e++){
      int[] fS = null;
      try {
        fS = e2f_probs.get(e).getTranslations(0.0f);
      } catch (Exception e1) {
        e1.printStackTrace();
      }
      float df=0;
      for(int f : fS){
        float probEF = e2f_probs.get(e, f);
        String fTerm = fVocabTrg.get(f);
        if(!dfs.containsKey(fTerm)){  //only if word is in the collection, can it contribute to the df values.
          continue;
        }     
        float df_f = dfs.get(fTerm);
        df+=(probEF*df_f);
      }
      transDfTable.put(e, df);
    }
    return transDfTable;
  }
View Full Code Here

        if (e1.getDocno() < mBlockStart)
          continue;
        if (e1.getDocno() >= mBlockEnd)
          break;

        HMapIFW map = new HMapIFW();

        sLogger.debug(key + ": " + e1);

        PostingsReader reader2 = postings.getPostingsReader();

        while (reader2.nextPosting(e2)) {

          sLogger.debug(key + ": " + e1 + ", " + e2);

          if (e1.getDocno() == e2.getDocno())
            continue;

          // compute partial score of similarity for a pair of
          // documents
          float weight = mModel.computeScore(e1.getScore(), e2.getScore(),
              mDocLengthTable.getDocLength(e1.getDocno()), mDocLengthTable
                  .getDocLength(e2.getDocno()));

          map.put(e2.getDocno(), weight);
        }
        output.collect(new IntWritable(e1.getDocno()), map);
      }
    }
View Full Code Here

    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);

    SequenceFile.Reader reader;
    IntWritable key = new IntWritable();
    HMapIFW map = new HMapIFW();
    WeightedIntDocVector value = new WeightedIntDocVector();

    reader = new SequenceFile.Reader(fs.getConf(),
        SequenceFile.Reader.file(new Path(enwikiEn + "/test_wt-int-doc-vectors/part-00000")));

    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    map = value.getWeightedTerms();
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(enIntDocVector1, value);

    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    map = value.getWeightedTerms();
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(enIntDocVector2, value);
    reader.close();
  }
View Full Code Here

    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);

    SequenceFile.Reader reader;
    IntWritable key = new IntWritable();
    HMapIFW map = new HMapIFW();
    WeightedIntDocVector value = new WeightedIntDocVector();

    reader = new SequenceFile.Reader(fs.getConf(),
        SequenceFile.Reader.file(new Path(dewikiEn + "/test_wt-int-doc-vectors/part-00000")));
    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    map = value.getWeightedTerms();
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(deIntDocVector1, value);

    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    map = value.getWeightedTerms();
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(deIntDocVector2, value);
    reader.close();
  }
View Full Code Here

TOP

Related Classes of edu.umd.cloud9.util.map.HMapII

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.