Package edu.umd.cloud9.io.map

Examples of edu.umd.cloud9.io.map.HMapIFW


  /**
   * Scales a probability distribution and drops low-probability entries:
   * each value is multiplied by {@code scale}, and entries whose scaled
   * probability does not exceed {@code threshold} are removed.
   *
   * @param threshold minimum scaled probability for an entry to be kept
   * @param scale factor applied to each probability
   * @param probMap distribution to scale
   */
  public static HMapSFW scaleProbMap(float threshold, float scale, HMapSFW probMap) {
    HMapSFW scaledProbMap = new HMapSFW();

    for (Entry<String> entry : probMap.entrySet()) {
      float pr = entry.getValue() * scale;
      if (pr > threshold) {
        scaledProbMap.put(entry.getKey(), pr);
      }
    }

    return scaledProbMap;
  }
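A short usage sketch (the values are illustrative, and scaleProbMap is assumed to be in scope as the static utility defined above):

  HMapSFW probMap = new HMapSFW();
  probMap.put("casa", 0.6f);
  probMap.put("hogar", 0.3f);
  probMap.put("domicilio", 0.1f);

  // 0.6*0.5 = 0.3 and 0.3*0.5 = 0.15 survive the 0.1 threshold;
  // 0.1*0.5 = 0.05 is dropped.
  HMapSFW scaled = scaleProbMap(0.1f, 0.5f, probMap);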


  /**
   * Combines a list of weighted probability distributions into a single
   * distribution, normalizing the weights if they do not sum to 1.0.
   *
   * @param threshold
   *    minimum combined probability for an entry to be kept
   * @param scale
   *    value between 0 and 1 that determines total probability in the final
   *    distribution (e.g., a scale of 0.2 turns [0.8 0.1 0.1] into [0.16 0.02 0.02])
   * @param probMaps
   *    list of probability distributions
   */
  public static HMapSFW combineProbMaps(float threshold, float scale, List<PairOfFloatMap> probMaps) {
    HMapSFW combinedProbMap = new HMapSFW();

    int numDistributions = probMaps.size();

    // get a combined set of all translation alternatives
    // compute normalization factor when sum of weights is not 1.0
    Set<String> translationAlternatives = new HashSet<String>();
    float sumWeights = 0;
    for (int i=0; i < numDistributions; i++) {
      HMapSFW dist = probMaps.get(i).getMap();
      float weight = probMaps.get(i).getWeight();

      // don't add vocabulary from a distribution that has 0 weight
      if (weight > 0) {
        translationAlternatives.addAll(dist.keySet());
        sumWeights += weight;
      }
    }

    // normalize by sumWeights
    for (String e : translationAlternatives) {
      float combinedProb = 0f;
      for (int i=0; i < numDistributions; i++) {
        HMapSFW dist = probMaps.get(i).getMap();
        float weight = probMaps.get(i).getWeight();
        combinedProb += (weight/sumWeights) * dist.get(e);    // Prob(e|f) = weighted average of all distributions
      }
      combinedProb *= scale;
      if (combinedProb > threshold) {
        combinedProbMap.put(e, combinedProb);
      }
    }
    return combinedProbMap;
  }
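A small worked example (illustrative values; PairOfFloatMap is assumed here to pair an HMapSFW with a float weight via a (map, weight) constructor — check the actual class for the exact signature):

  HMapSFW d1 = new HMapSFW();
  d1.put("house", 0.8f);
  d1.put("home", 0.2f);

  HMapSFW d2 = new HMapSFW();
  d2.put("house", 0.5f);
  d2.put("building", 0.5f);

  List<PairOfFloatMap> dists = new ArrayList<PairOfFloatMap>();
  dists.add(new PairOfFloatMap(d1, 0.7f));
  dists.add(new PairOfFloatMap(d2, 0.3f));

  // house: 0.7*0.8 + 0.3*0.5 = 0.71; home: 0.7*0.2 = 0.14; building: 0.3*0.5 = 0.15
  // all three exceed the 0.1 threshold, so all are kept
  HMapSFW combined = combineProbMaps(0.1f, 1.0f, dists);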

  /**
   * Normalizes each translation distribution in {@code probMap}: values are
   * divided by their sum, entries at or below {@code lexProbThreshold} are
   * dropped, and the highest-probability translations are kept until either
   * the cumulative-probability or the count limit is reached.
   *
   * @param probMap map from each source term to its translation distribution
   * @param lexProbThreshold minimum per-entry probability after normalization
   * @param cumProbThreshold stop once this much cumulative probability is kept
   * @param maxNumTrans maximum number of translations kept per source term
   */
  public static void normalize(Map<String, HMapSFW> probMap, float lexProbThreshold, float cumProbThreshold, int maxNumTrans) {
    for (String sourceTerm : probMap.keySet()) {
      HMapSFW probDist = probMap.get(sourceTerm);
      TreeSet<PairOfStringFloat> sortedFilteredProbDist = new TreeSet<PairOfStringFloat>();
      HMapSFW normProbDist = new HMapSFW();

      // compute normalization factor
      float sumProb = 0;
      for (Entry<String> entry : probDist.entrySet()) {
        sumProb += entry.getValue();
      }

      // normalize values and remove low-prob entries based on normalized values
      float sumProb2 = 0;
      for (Entry<String> entry : probDist.entrySet()) {
        float pr = entry.getValue() / sumProb;
        if (pr > lexProbThreshold) {
          sumProb2 += pr;
          sortedFilteredProbDist.add(new PairOfStringFloat(entry.getKey(), pr));
        }
      }

      // re-normalize values after removal of low-prob entries
      float cumProb = 0;
      int cnt = 0;
      while (cnt < maxNumTrans && cumProb < cumProbThreshold && !sortedFilteredProbDist.isEmpty()) {
        PairOfStringFloat entry = sortedFilteredProbDist.pollLast();
        float pr = entry.getValue() / sumProb2;
        cumProb += pr;
        normProbDist.put(entry.getKey(), pr);
        cnt++;
      }

      probMap.put(sourceTerm, normProbDist);
    }
  }
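A usage sketch (illustrative numbers): with raw scores 6/3/1, normalization yields 0.6/0.3/0.1, and with maxNumTrans = 2 and cumProbThreshold = 0.9f only the two strongest translations are kept.

  HMapSFW dist = new HMapSFW();
  dist.put("maison", 6f);
  dist.put("domicile", 3f);
  dist.put("baraque", 1f);

  Map<String, HMapSFW> probMap = new HashMap<String, HMapSFW>();
  probMap.put("house", dist);

  // after the call: maison = 0.6, domicile = 0.3; baraque is dropped
  // (cumProb reaches 0.9 and cnt reaches maxNumTrans at the same point)
  normalize(probMap, 0.05f, 0.9f, 2);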

  public static junit.framework.Test suite() {
    return new JUnit4TestAdapter(EnAr_TREC02.class);
  }

  public static void main(String[] args) {
    //    HMapSFW gridAPMap = array2Map(Interp_AP);
    HMapSFW tenbestAPMap = array2Map(Nbest_AP.get(2));
    HMapSFW onebestAPMap = array2Map(Onebest_AP.get(1));
    HMapSFW grammarAPMap = array2Map(grammar_AP.get(0));
    HMapSFW tokenAPMap = array2Map(baseline_token_AP);
    //    System.out.println(countNumberOfImprovedTopics(tokenAPMap, gridAPMap));
    System.out.println(countNumberOfImprovedTopics(tokenAPMap, tenbestAPMap));
    System.out.println(countNumberOfImprovedTopics(tokenAPMap, onebestAPMap));
    System.out.println(countNumberOfImprovedTopics(tokenAPMap, grammarAPMap));
    System.out.println(countNumberOfImprovedTopics(tokenAPMap, tokenAPMap));
  }

    }
    return cnt;
  }
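Only the tail of countNumberOfImprovedTopics appears in the excerpt above. Judging from how main uses it, a plausible implementation (hypothetical, not the original) counts topics whose score improves over the baseline:

  // Hypothetical reconstruction: count topics where 'experiment' beats 'baseline'.
  private static int countNumberOfImprovedTopics(HMapSFW baseline, HMapSFW experiment) {
    int cnt = 0;
    for (Entry<String> entry : baseline.entrySet()) {
      if (experiment.get(entry.getKey()) > entry.getValue()) {
        cnt++;
      }
    }
    return cnt;
  }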

  private static HMapSFW array2Map(String[] array) {
    HMapSFW map = new HMapSFW();
    for ( int i = 0; i < array.length; i += 2 ) {
      map.put(array[i], Float.parseFloat(array[i+1]));
    }
    return map;
  }
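For example (hypothetical topic IDs and AP scores), a flat array of alternating keys and values becomes a map:

  String[] apScores = { "301", "0.41", "302", "0.18", "303", "0.77" };
  HMapSFW apMap = array2Map(apScores);
  // apMap now maps "301" -> 0.41f, "302" -> 0.18f, "303" -> 0.77f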

    Map<String,HMapSFW> scfgDist = new HashMap<String,HMapSFW>();

    // The phrase2count table is a set of (source_phrase --> X) maps,
    // where X is a set of (phrase_trans --> count) maps.
    HMapSFW phraseDist = new HMapSFW();

    HMapSIW srcTokenCnt = new HMapSIW();

    Set<String> bagOfTargetTokens = new HashSet<String>();

    try {
      FSDataInputStream fis = fs.open(new Path(grammarFile));
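The excerpt opens the grammar file but cuts off before the read loop. A common pattern for consuming the FSDataInputStream line by line (a sketch, not the original code) is:

      // Sketch: wrap the Hadoop stream and process each grammar rule line.
      BufferedReader reader = new BufferedReader(new InputStreamReader(fis, "UTF-8"));
      String rule;
      while ((rule = reader.readLine()) != null) {
        // ... parse the SCFG rule and update scfgDist, phraseDist, srcTokenCnt ...
      }
      reader.close();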


        String[] parts = rule.split("\\|\\|\\|");
        String[] lhs = parts[0].trim().split(" ");
        String[] rhs = parts[1].trim().split(" ");
        for (String l : lhs) {
          for (String r : rhs) {
            pairsInSCFG.add(new PairOfStrings(l, r));
          }
        }
      }
    } catch (UnsupportedEncodingException e) {
      e.printStackTrace();
    }
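For reference, grammar rules separate source and target sides with |||; with the indexing used above, an illustrative rule splits like this:

    // Illustrative rule in "source ||| target" form.
    String rule = "a b [X,1] c ||| [X,1] d e";
    String[] parts = rule.split("\\|\\|\\|");
    String[] lhs = parts[0].trim().split(" ");  // { "a", "b", "[X,1]", "c" }
    String[] rhs = parts[1].trim().split(" ");  // { "[X,1]", "d", "e" }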

    }

    // In an SCFG rule such as "a b X1 X2 c --> X1 d e X2 f", we want to find the
    // src/trg tokens that are aligned to some trg/src token, ignoring the X variables.
    // We can then decide whether to include a multi-token phrase in the query
    // representation based on various heuristics (e.g., only include it if no X
    // appears in between the tokens).
    String fPhrase = "";
    ArrayListOfInts sourceTokenIds = new ArrayListOfInts();     
    ArrayListOfInts targetTokenIds = new ArrayListOfInts();
    int f=0;
    for (; f < lhs.length; f++) {
      String fTerm = lhs[f];
      if (queryLangTokenizer.isStopWord(fTerm) || fTerm.matches("\\[X,\\d+\\]") || fTerm.matches("<s>") || fTerm.matches("</s>")) {
        continue;
      }

      srcTokenCnt.increment(fTerm);
      sourceTokenIds.add(f);

      ArrayListOfInts ids;
      if (isPassThrough) {
        ids = new ArrayListOfInts();
        ids.add(0);
      } else {
        ids = one2manyAlign.get(f);
      }

      if (ids == null || (isOne2Many == 0 && ids.size() > 1)) {
        continue;
      }

      // find phrase in LHS and match to phrase in RHS
      if (isMany2Many) {
        fPhrase += fTerm + " ";
        targetTokenIds = targetTokenIds.mergeNoDuplicates(ids);       
      }

      String eTerm = null;
      for (int e : ids) {
        eTerm = rhs[e];

        // assumption: if this is pass-through rule, re-stem token in doc-language
        if (isPassThrough || (unknownWords != null && unknownWords.contains(fTerm))) {
          eTerm = stemmed2Stemmed.get(eTerm);
        }

        if (eTerm == null || docLangTokenizer.isStopWord(eTerm)) {
          //          LOG.info("Skipped trg token " + eTerm);
          eTerm = null;
          continue;     
        }
        bagOfTargetTokens.add(eTerm);
        if (isOne2Many <= 1) {
          if (probDist.containsKey(fTerm)) {
            HMapSFW eToken2Prob = probDist.get(fTerm);
            eToken2Prob.increment(eTerm, weight);
          } else {
            HMapSFW eToken2Prob = new HMapSFW();
            eToken2Prob.put(eTerm, weight);
            probDist.put(fTerm, eToken2Prob);
          }
        }
      }

      if (isOne2Many == 2) {
        // if ids.size() > 1 eTerm is a multi-token expression
        // even if eTerm is overwritten here, we need to do above loop to update bagOfTargetTokens
        if (ids.size() > 1) {
          eTerm = isConsecutiveWithStopwords(ids, rhs, docLangTokenizer);     // <---- heuristic
        }

        // no proper translation on target-side (e.g., stopword OR non-consecutive multi-word translation), let's skip
        if (eTerm == null) {
          continue;
        }
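The token filter above relies on the regex \[X,\d+\] to skip SCFG variables. A quick check of what the pattern accepts (illustrative tokens):

      // Matches variable tokens such as [X,1] or [X,12], but not plain words.
      System.out.println("[X,1]".matches("\\[X,\\d+\\]"));   // true
      System.out.println("[X,12]".matches("\\[X,\\d+\\]"));  // true
      System.out.println("casa".matches("\\[X,\\d+\\]"));    // false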

      String[] alPair = alignment.split("-");
      int f = Integer.parseInt(alPair[0]);
      int e = Integer.parseInt(alPair[1]);

      if (!one2manyAlign.containsKey(f)) {
        one2manyAlign.put(f, new ArrayListOfInts());
      }
      one2manyAlign.get(f).add(e);
    }

    // for each source token id, sort ids of its translations in ascending order
    for (Integer f : one2manyAlign.keySet()) {
      ArrayListOfInts lst = one2manyAlign.get(f);
      lst.sort();    // sorts the list in place
      one2manyAlign.put(f, lst);
    }

    return one2manyAlign;
  }
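A usage sketch (the method name and signature are inferred from the excerpt; the actual code may differ): given the alignment string "0-0 0-1 2-1", source token 0 maps to target tokens {0, 1} and source token 2 maps to {1}.

  // Hypothetical call, assuming the method parses a whitespace-delimited
  // alignment string of "f-e" pairs into a source-to-targets map.
  Map<Integer, ArrayListOfInts> one2manyAlign = getOne2ManyAlign("0-0 0-1 2-1");
  // one2manyAlign.get(0) -> [0, 1]
  // one2manyAlign.get(2) -> [1]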
