Examples of edu.umd.cloud9.io.map.HMapIIW

edu.umd.cloud9.io.pair.PairOfFloats

Writable representing a map where both keys and values are ints.

One notable feature of this class is the ability to support lazy decoding, controlled by the {@link #setLazyDecodeFlag(boolean)} method. In lazy decoding mode, when an object of this type is deserialized, key-value pairs are not inserted into the map, but rather held in arrays. This reduces memory used in cases where random access to values is not required. In lazy decoding mode, the raw keys and values may be fetched by the {@link #getKeys()} and {@link #getValues()} methods, respectively. The map can be subsequently populated with the {@link #decode()} method.
@author Jimmy Lin

    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);


    SequenceFile.Reader reader;
    IntWritable key = new IntWritable();
    HMapSFW value = new HMapSFW();


    reader = new SequenceFile.Reader(fs.getConf(),
        SequenceFile.Reader.file(new Path(opennlpIndex + "/test_wt-term-doc-vectors/part-00000")));


    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    for (MapKF.Entry<String> entry : value.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyTermDocVector(opennlpTermDocVector1, value);


    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    for (MapKF.Entry<String> entry : value.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyTermDocVector(opennlpTermDocVector2, value);
    reader.close();
  }

View Full Code Here

    Map<String,HMapSFW> scfgDist = new HashMap<String,HMapSFW>();


    // phrase2count table is a set of (source_phrase --> X) maps, where X is a set of (phrase_trans --> count) maps
    HMapSFW phraseDist = new HMapSFW();


    HMapSIW srcTokenCnt = new HMapSIW();


    Set<String> bagOfTargetTokens = new HashSet<String>();


    try {
      FSDataInputStream fis = fs.open(new Path(grammarFile));

View Full Code Here

      return null;
    }
    PriorityQueue<PairOfFloatInt> eS = f2eProbs.get(f).getTranslationsWithProbs(lexProbThreshold);


    if (!eS.isEmpty()) {
      PairOfFloatInt entry = eS.poll();
      int e = entry.getRightElement();
      String eTerm = eVocab_f2e.get(e);
      return eTerm;
    }
    return token;
  }

View Full Code Here


    float sumProbEF = 0;
    int numTrans = 0;
    //tf(e) = sum_f{tf(f)*prob(e|f)}
    while (numTrans < numTransPerToken && !eS.isEmpty()) {
      PairOfFloatInt entry = eS.poll();
      float probEF = entry.getLeftElement();
      int e = entry.getRightElement();
      String eTerm = eVocab_f2e.get(e);


      //      LOG.info("Pr("+eTerm+"|"+token+")="+probEF);


      if (probEF > 0 && e > 0 && !docLangTokenizer.isStopWord(eTerm) && (translateOnly == null || !translateOnly.equals("indri") || indriPuncPattern.matcher(eTerm).matches()) && (pairsInSCFG == null || pairsInSCFG.contains(new PairOfStrings(token,eTerm)))) {

View Full Code Here

            curIndex = prevIndex;    // revert curIndex value since we're skipping this one
            skipTerm = true;
            continue;
          }
          logger.debug("Processing: "+srcTerm+" with index: "+curIndex);      
          topTrans.add(new PairOfFloatString(prob, trgTerm));
          sumOfProbs += prob;
          logger.debug("Added to queue: "+trgTerm+" with prob: "+prob+" (sum: "+sumOfProbs+")");      
        }else if(!earlyTerminate && !skipTerm && !delims.contains(srcTerm)){  //continue adding translation term,prob pairs (except if early termination is ON)
          topTrans.add(new PairOfFloatString(prob, trgTerm));
          sumOfProbs += prob;
          logger.debug("Added to queue: "+trgTerm+" with prob: "+prob+" (sum: "+sumOfProbs+")");      


          // keep top numTrans translations
          if(topTrans.size() > numTrans){
            PairOfFloatString pair = topTrans.pollFirst();
            float removedProb = pair.getLeftElement();
            sumOfProbs -= removedProb;
            logger.debug("Removed from queue: "+pair.getRightElement()+" (sum: "+sumOfProbs+")");      
          }
        }else{
          logger.debug("Skipped line: "+line);
        }
      }

View Full Code Here

          continue;
        }
        prob = ttable.get(srcIndex, trgIndex);
        logger.debug("Found: " + trgTerm + " with " + prob);


        topTrans.add(new PairOfFloatString(prob, trgTerm));
        // keep top numTrans translations
        if (topTrans.size() > numTrans) {
          float removedProb = topTrans.pollFirst().getLeftElement();
          sumOfProbs -= removedProb;
        }

View Full Code Here

    List<Integer> sortedIndices = new ArrayList<Integer>();
    HMapIF index2ProbMap = new HMapIF();


    float sumOfProbs = 0.0f;    //only extract the top K<15 if the mass prob. exceeds MAX_probThreshold
    while(!topTrans.isEmpty() && sumOfProbs < cumProbThreshold){
      PairOfFloatString e = topTrans.pollLast();
      String term = e.getRightElement();
      float pr = e.getLeftElement()/cumProb;    // normalize
      logger.debug(term+"-->"+pr);
      int trgIndex = trgVocab.addOrGet(term);
      sumOfProbs += e.getLeftElement();         // keep track of unnormalized cumulative prob for determining cutoff
      sortedIndices.add(trgIndex);
      index2ProbMap.put(trgIndex, pr);
    }


    // to enable faster access with binary search, we sort entries by vocabulary index.

View Full Code Here

          int e2 = eVocabTrg.get(eTerm);         


          float prob2 = f2e_Probs.get(f2, e2);
          float prob = prob1*prob2;
          sumOfProbs += prob;
          topTrans.add(new PairOfFloatString(prob, fTerm));
        }
        logger.info("Adding "+eTerm);
        addToTable(e1, topTrans, sumOfProbs, table, fVocabTrg, 1.0f, stats);      
      }
      logger.info(stats);

View Full Code Here

    if (featSet > 2) {
      // uppercase token matching features : find uppercased tokens that exactly appear on both sides
      // lack of this evidence does not imply anything, but its existence might indicate parallel
//      fSentence.replaceAll("([',:;.?%!])", " $1 ");
//      eSentence.replaceAll("([',:;.?%!])", " $1 ");
      PairOfFloats pair = getUppercaseRatio(fTokenizer.processContent(fSentence), eTokenizer.processContent(eSentence));
      features.add("uppercaseratio1=" + pair.getLeftElement() );
      features.add("uppercaseratio2=" + pair.getRightElement() );
    }


    if (featSet > 3) {
      // future work = count number of single/double letter words in src and trg side

View Full Code Here

    // now, read tokens in first sentence and keep track of sequences of uppercased tokens in buffer
    HashSet<String> upperCaseMap1 = getUppercaseParts(tokens1);
    HashSet<String> upperCaseMap2 = getUppercaseParts(tokens2);
    float cntUpperRatio1 = getRatio(upperCaseMap1, upperCaseMap2);
    float cntUpperRatio2 = getRatio(upperCaseMap2, upperCaseMap1);
    PairOfFloats result = new PairOfFloats(cntUpperRatio1, cntUpperRatio2);
    return result;
  }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of edu.umd.cloud9.io.map.HMapIIW

bak.pcj.IntIterator

edu.umd.cloud9.example.bfs.BfsNodeTest

edu.umd.cloud9.example.pagerank.PageRankNodeTest

edu.umd.cloud9.io.array.ArrayListOfIntsWritable

edu.umd.cloud9.io.fastutil.Int2FloatOpenHashMapWritableTest

edu.umd.cloud9.io.fastutil.Int2IntOpenHashMapWritableTest

edu.umd.cloud9.io.fastutil.String2FloatOpenHashMapWritableTest

edu.umd.cloud9.io.fastutil.String2IntOpenHashMapWritableTest

edu.umd.cloud9.io.map.HMapIDW

edu.umd.cloud9.io.map.HMapIDWTest

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.