Examples of HMapIFW


Examples of edu.umd.cloud9.io.map.HMapIFW

      return s;
    }

    public void map(IntWritable docno, WeightedIntDocVector docvectorIn,
        OutputCollector<IntWritable, NBitSignature> output, Reporter reporter) throws IOException {
      HMapIFW docvector = docvectorIn.getWeightedTerms();
      FloatAsBytesWritable value;

      for (int i = 0; i < randomUnitVectors.size(); i++) {
        value = (FloatAsBytesWritable) randomUnitVectors.get(i);
        double dprod = dotProduct(docvector, value);
View Full Code Here

Examples of edu.umd.cloud9.io.map.HMapIFW

      }else if (numTerms < MIN_SIZE) {
        reporter.incrCounter(Docs.SHORT, 1);
        return;
      }

      HMapIFW tfS = new HMapIFW();
      // We simply use the source-language doc length since the ratio of doc length to average doc
      // length is unlikely to change significantly (not worth complicating the pipeline)
      int docLen = CLIRUtils.translateTFs(doc, tfS, eVocabSrc, eVocabTrg, fVocabSrc, fVocabTrg,
          e2f_Probs, f2e_Probs, tokenizer, LOG);
View Full Code Here

Examples of edu.umd.cloud9.io.map.HMapIFW

     

      DefaultFrequencySortedDictionary dict = new DefaultFrequencySortedDictionary(new Path(env.getIndexTermsData()), new Path(env.getIndexTermIdsData()), new Path(env.getIndexTermIdMappingData()), fs2);
      DfTableArray dfTable = new DfTableArray(new Path(dfByIntFile), fs2);

      HMapIFW transDfTable = CLIRUtils.translateDFTable(eVocab_e2f, fVocab_e2f, en2DeProbs, dict, dfTable);

      SequenceFile.Writer writer = SequenceFile.createWriter(fs2, conf, new Path(transDfFile), IntWritable.class, FloatWritable.class);
      for(MapIF.Entry term : transDfTable.entrySet()){
        reporter.incrCounter(DF.TransDf, 1);
        writer.append(new IntWritable(term.getKey()), new FloatWritable(term.getValue()));
      }
      writer.close();
    }
View Full Code Here

Examples of edu.umd.cloud9.io.map.HMapIFW

    }

    public void map(IntWritable docno, WeightedIntDocVector docvectorIn,
        OutputCollector<IntWritable, MinhashSignature> output, Reporter reporter)
    throws IOException {
      HMapIFW docvector = docvectorIn.getWeightedTerms();
      signature.clear();

      for (int i = 0; i < randomOrderings.size(); i++) {
        int minTerm = getMinHashTerm(docvector, (ArrayListOfIntsWritable) randomOrderings.get(i));
        signature.add(minTerm);
View Full Code Here

Examples of edu.umd.cloud9.io.map.HMapIFW

    for(String term : terms){
      term2Tf.increment(term);
    }

    //translated tf values
    HMapIFW transTermTf = new HMapIFW();
    for(Entry<String> entry : term2Tf.entrySet()){
      String fTerm = entry.getKey();
      int tf = entry.getValue();
      // transTermTf won't be updated if fTerm not in vocab
      transTermTf = CLIRUtils.updateTFsByTerm(fTerm, tf, transTermTf, eVocabSrc, eVocabTrg, fVocabSrc, fVocabTrg, e2f_Probs, f2e_Probs, eTok, sLogger);
View Full Code Here

Examples of edu.umd.cloud9.io.map.HMapIFW

    Tokenizer tokenizer = TokenizerFactory.createTokenizer(eLang,
        eTokenizerModelFile, true, eStopwordsFile, eStopwordsFile + ".stemmed", null);

    // translate doc texts here
    for (HMapSIW deDoc : docs) {
      HMapIFW tfS = new HMapIFW();
      int docLen = 0;
      try {
        docLen = CLIRUtils.translateTFs(deDoc, tfS, eVocabSrc, eVocabTrg, fVocabSrc,
            fVocabTrg, e2f_Probs, f2e_Probs, tokenizer , null);   // tokenizer just for stopword list
      } catch (IOException e) {
View Full Code Here

Examples of edu.umd.cloud9.io.map.HMapIFW

      if (numTerms < MIN_SIZE) {
        reporter.incrCounter(Docs.SHORT, 1);
        return;
      }

      HMapIFW tfS = new HMapIFW();
      // We simply use the source-language doc length since the ratio of doc length to average doc
      // length is unlikely to change significantly (not worth complicating the pipeline)
      int docLen = CLIRUtils.translateTFs(doc, tfS, eVocabSrc, eVocabTrg, fVocabSrc, fVocabTrg,
          e2f_Probs, f2e_Probs, tokenizer, LOG);
View Full Code Here

Examples of edu.umd.cloud9.io.map.HMapIFW

     

      DefaultFrequencySortedDictionary dict = new DefaultFrequencySortedDictionary(new Path(env.getIndexTermsData()), new Path(env.getIndexTermIdsData()), new Path(env.getIndexTermIdMappingData()), fs2);
      DfTableArray dfTable = new DfTableArray(new Path(dfByIntFile), fs2);

      HMapIFW transDfTable = CLIRUtils.translateDFTable(eVocab_e2f, fVocab_e2f, en2DeProbs, dict, dfTable);

      SequenceFile.Writer writer = SequenceFile.createWriter(fs2, conf, new Path(transDfFile), IntWritable.class, FloatWritable.class);
      for(MapIF.Entry term : transDfTable.entrySet()){
        reporter.incrCounter(DF.TransDf, 1);
        writer.append(new IntWritable(term.getKey()), new FloatWritable(term.getValue()));
      }
      writer.close();
    }
View Full Code Here

Examples of edu.umd.cloud9.io.map.HMapIFW

   *     FileSystem object
   * @return
   *     mapping from term ids to df values
   */
  public static HMapIFW readTransDfTable(Path path, FileSystem fs) {
    HMapIFW transDfTable = new HMapIFW();
    try {
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());

      IntWritable key = (IntWritable) reader.getKeyClass().newInstance();
      FloatWritable value = (FloatWritable) reader.getValueClass().newInstance();

      while (reader.next(key, value)) {
        transDfTable.put(key.get(), value.get());
        //        logger.info(key.get()+"-->"+value.get());
        key = (IntWritable) reader.getKeyClass().newInstance();
        value = (FloatWritable) reader.getValueClass().newInstance();
      }
      reader.close();
View Full Code Here

Examples of edu.umd.cloud9.io.map.HMapIFW

   *     ttable E-->F (i.e., Pr(f|e))
   * @return
   *     mapping from E-terms to their computed df values
   */
  public static HMapIFW translateDFTable(Vocab eVocabSrc, Vocab fVocabTrg, TTable_monolithic_IFAs e2f_probs, FrequencySortedDictionary dict, DfTableArray dfTable){
    HMapIFW transDfTable = new HMapIFW();
    for(int e=1;e<eVocabSrc.size();e++){
      int[] fS = e2f_probs.get(e).getTranslations(0.0f);
      float df=0;
      for(int f : fS){
        float probEF = e2f_probs.get(e, f);
        String fTerm = fVocabTrg.get(f);
        int id = dict.getId(fTerm);
        if(id != -1){
          float df_f = dfTable.getDf(id);       
          df += (probEF*df_f);
        }else{
          logger.debug(fTerm+" not in dict");
        }
      }
      transDfTable.put(e, df);
    }
    return transDfTable;
  }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by ORACLE Inc. Contact coftware#gmail.com.