Package edu.umd.cloud9.io.map

Examples of edu.umd.cloud9.io.map.HMapIFW


  /**
   * Scales a probability distribution and drops low-probability entries:
   * each value is multiplied by {@code scale}, and entries whose scaled
   * probability does not exceed {@code threshold} are removed.
   *
   * @param threshold minimum scaled probability for an entry to be kept
   * @param scale factor applied to each probability
   * @param probMap distribution to scale
   */
  public static HMapSFW scaleProbMap(float threshold, float scale, HMapSFW probMap) {
    HMapSFW scaledProbMap = new HMapSFW();

    for (Entry<String> entry : probMap.entrySet()) {
      float pr = entry.getValue() * scale;
      if (pr > threshold) {
        scaledProbMap.put(entry.getKey(), pr);
      }
    }

    return scaledProbMap;
  }
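A short usage sketch (the values are illustrative, and scaleProbMap is assumed to be in scope as the static utility defined above):

  HMapSFW probMap = new HMapSFW();
  probMap.put("casa", 0.6f);
  probMap.put("hogar", 0.3f);
  probMap.put("domicilio", 0.1f);

  // 0.6*0.5 = 0.3 and 0.3*0.5 = 0.15 survive the 0.1 threshold;
  // 0.1*0.5 = 0.05 is dropped.
  HMapSFW scaled = scaleProbMap(0.1f, 0.5f, probMap);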


  /**
   * Combines a list of weighted probability distributions into a single
   * distribution, normalizing the weights if they do not sum to 1.0.
   *
   * @param threshold
   *    minimum combined probability for an entry to be kept
   * @param scale
   *    value between 0 and 1 that determines total probability in the final
   *    distribution (e.g., a scale of 0.2 turns [0.8 0.1 0.1] into [0.16 0.02 0.02])
   * @param probMaps
   *    list of probability distributions
   */
  public static HMapSFW combineProbMaps(float threshold, float scale, List<PairOfFloatMap> probMaps) {
    HMapSFW combinedProbMap = new HMapSFW();

    int numDistributions = probMaps.size();

    // get a combined set of all translation alternatives
    // compute normalization factor when sum of weights is not 1.0
    Set<String> translationAlternatives = new HashSet<String>();
    float sumWeights = 0;
    for (int i=0; i < numDistributions; i++) {
      HMapSFW dist = probMaps.get(i).getMap();
      float weight = probMaps.get(i).getWeight();

      // don't add vocabulary from a distribution that has 0 weight
      if (weight > 0) {
        translationAlternatives.addAll(dist.keySet());
        sumWeights += weight;
      }
    }

    // normalize by sumWeights
    for (String e : translationAlternatives) {
      float combinedProb = 0f;
      for (int i=0; i < numDistributions; i++) {
        HMapSFW dist = probMaps.get(i).getMap();
        float weight = probMaps.get(i).getWeight();
        combinedProb += (weight/sumWeights) * dist.get(e);    // Prob(e|f) = weighted average of all distributions
      }
      combinedProb *= scale;
      if (combinedProb > threshold) {
        combinedProbMap.put(e, combinedProb);
      }
    }
    return combinedProbMap;
  }
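A small worked example (illustrative values; PairOfFloatMap is assumed here to pair an HMapSFW with a float weight via a (map, weight) constructor — check the actual class for the exact signature):

  HMapSFW d1 = new HMapSFW();
  d1.put("house", 0.8f);
  d1.put("home", 0.2f);

  HMapSFW d2 = new HMapSFW();
  d2.put("house", 0.5f);
  d2.put("building", 0.5f);

  List<PairOfFloatMap> dists = new ArrayList<PairOfFloatMap>();
  dists.add(new PairOfFloatMap(d1, 0.7f));
  dists.add(new PairOfFloatMap(d2, 0.3f));

  // house: 0.7*0.8 + 0.3*0.5 = 0.71; home: 0.7*0.2 = 0.14; building: 0.3*0.5 = 0.15
  // all three exceed the 0.1 threshold, so all are kept
  HMapSFW combined = combineProbMaps(0.1f, 1.0f, dists);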

  /**
   * Normalizes each translation distribution in {@code probMap}: values are
   * divided by their sum, entries at or below {@code lexProbThreshold} are
   * dropped, and the highest-probability translations are kept until either
   * the cumulative-probability or the count limit is reached.
   *
   * @param probMap map from each source term to its translation distribution
   * @param lexProbThreshold minimum per-entry probability after normalization
   * @param cumProbThreshold stop once this much cumulative probability is kept
   * @param maxNumTrans maximum number of translations kept per source term
   */
  public static void normalize(Map<String, HMapSFW> probMap, float lexProbThreshold, float cumProbThreshold, int maxNumTrans) {
    for (String sourceTerm : probMap.keySet()) {
      HMapSFW probDist = probMap.get(sourceTerm);
      TreeSet<PairOfStringFloat> sortedFilteredProbDist = new TreeSet<PairOfStringFloat>();
      HMapSFW normProbDist = new HMapSFW();

      // compute normalization factor
      float sumProb = 0;
      for (Entry<String> entry : probDist.entrySet()) {
        sumProb += entry.getValue();
      }

      // normalize values and remove low-prob entries based on normalized values
      float sumProb2 = 0;
      for (Entry<String> entry : probDist.entrySet()) {
        float pr = entry.getValue() / sumProb;
        if (pr > lexProbThreshold) {
          sumProb2 += pr;
          sortedFilteredProbDist.add(new PairOfStringFloat(entry.getKey(), pr));
        }
      }

      // re-normalize values after removal of low-prob entries
      float cumProb = 0;
      int cnt = 0;
      while (cnt < maxNumTrans && cumProb < cumProbThreshold && !sortedFilteredProbDist.isEmpty()) {
        PairOfStringFloat entry = sortedFilteredProbDist.pollLast();
        float pr = entry.getValue() / sumProb2;
        cumProb += pr;
        normProbDist.put(entry.getKey(), pr);
        cnt++;
      }

      probMap.put(sourceTerm, normProbDist);
    }
  }
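A usage sketch (illustrative numbers): with raw scores 6/3/1, normalization yields 0.6/0.3/0.1, and with maxNumTrans = 2 and cumProbThreshold = 0.9f only the two strongest translations are kept.

  HMapSFW dist = new HMapSFW();
  dist.put("maison", 6f);
  dist.put("domicile", 3f);
  dist.put("baraque", 1f);

  Map<String, HMapSFW> probMap = new HashMap<String, HMapSFW>();
  probMap.put("house", dist);

  // after the call: maison = 0.6, domicile = 0.3; baraque is dropped
  // (cumProb reaches 0.9 and cnt reaches maxNumTrans at the same point)
  normalize(probMap, 0.05f, 0.9f, 2);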

  public static junit.framework.Test suite() {
    return new JUnit4TestAdapter(EnAr_TREC02.class);
  }

  public static void main(String[] args) {
    //    HMapSFW gridAPMap = array2Map(Interp_AP);
    HMapSFW tenbestAPMap = array2Map(Nbest_AP.get(2));
    HMapSFW onebestAPMap = array2Map(Onebest_AP.get(1));
    HMapSFW grammarAPMap = array2Map(grammar_AP.get(0));
    HMapSFW tokenAPMap = array2Map(baseline_token_AP);
    //    System.out.println(countNumberOfImprovedTopics(tokenAPMap, gridAPMap));
    System.out.println(countNumberOfImprovedTopics(tokenAPMap, tenbestAPMap));
    System.out.println(countNumberOfImprovedTopics(tokenAPMap, onebestAPMap));
    System.out.println(countNumberOfImprovedTopics(tokenAPMap, grammarAPMap));
    System.out.println(countNumberOfImprovedTopics(tokenAPMap, tokenAPMap));
  }

    }
    return cnt;
  }
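Only the tail of countNumberOfImprovedTopics appears in the excerpt above. Judging from how main uses it, a plausible implementation (hypothetical, not the original) counts topics whose score improves over the baseline:

  // Hypothetical reconstruction: count topics where 'experiment' beats 'baseline'.
  private static int countNumberOfImprovedTopics(HMapSFW baseline, HMapSFW experiment) {
    int cnt = 0;
    for (Entry<String> entry : baseline.entrySet()) {
      if (experiment.get(entry.getKey()) > entry.getValue()) {
        cnt++;
      }
    }
    return cnt;
  }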

  private static HMapSFW array2Map(String[] array) {
    HMapSFW map = new HMapSFW();
    for ( int i = 0; i < array.length; i += 2 ) {
      map.put(array[i], Float.parseFloat(array[i+1]));
    }
    return map;
  }
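For example (hypothetical topic IDs and AP scores), a flat array of alternating keys and values becomes a map:

  String[] apScores = { "301", "0.41", "302", "0.18", "303", "0.77" };
  HMapSFW apMap = array2Map(apScores);
  // apMap now maps "301" -> 0.41f, "302" -> 0.18f, "303" -> 0.77f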

    Map<String,HMapSFW> scfgDist = new HashMap<String,HMapSFW>();

    // The phrase2count table is a set of (source_phrase --> X) maps,
    // where X is a set of (phrase_trans --> count) maps.
    HMapSFW phraseDist = new HMapSFW();

    HMapSIW srcTokenCnt = new HMapSIW();

    Set<String> bagOfTargetTokens = new HashSet<String>();

    try {
      FSDataInputStream fis = fs.open(new Path(grammarFile));
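The excerpt opens the grammar file but cuts off before the read loop. A common pattern for consuming the FSDataInputStream line by line (a sketch, not the original code) is:

      // Sketch: wrap the Hadoop stream and process each grammar rule line.
      BufferedReader reader = new BufferedReader(new InputStreamReader(fis, "UTF-8"));
      String rule;
      while ((rule = reader.readLine()) != null) {
        // ... parse the SCFG rule and update scfgDist, phraseDist, srcTokenCnt ...
      }
      reader.close();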


        String[] parts = rule.split("\\|\\|\\|");
        String[] lhs = parts[0].trim().split(" ");
        String[] rhs = parts[1].trim().split(" ");
        for (String l : lhs) {
          for (String r : rhs) {
            pairsInSCFG.add(new PairOfStrings(l, r));
          }
        }
      }
    } catch (UnsupportedEncodingException e) {
      e.printStackTrace();
    }
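For reference, grammar rules separate source and target sides with |||; with the indexing used above, an illustrative rule splits like this:

    // Illustrative rule in "source ||| target" form.
    String rule = "a b [X,1] c ||| [X,1] d e";
    String[] parts = rule.split("\\|\\|\\|");
    String[] lhs = parts[0].trim().split(" ");  // { "a", "b", "[X,1]", "c" }
    String[] rhs = parts[1].trim().split(" ");  // { "[X,1]", "d", "e" }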

    }

    // In an SCFG rule such as "a b X1 X2 c --> X1 d e X2 f", we want to find the
    // src/trg tokens that are aligned to some trg/src token, ignoring the X variables.
    // We can then decide whether to include a multi-token phrase in the query
    // representation based on various heuristics (e.g., only include it if no X
    // appears in between the tokens).
    String fPhrase = "";
    ArrayListOfInts sourceTokenIds = new ArrayListOfInts();     
    ArrayListOfInts targetTokenIds = new ArrayListOfInts();
    int f=0;
    for (; f < lhs.length; f++) {
      String fTerm = lhs[f];
      if (queryLangTokenizer.isStopWord(fTerm) || fTerm.matches("\\[X,\\d+\\]") || fTerm.matches("<s>") || fTerm.matches("</s>")) {
        continue;
      }

      srcTokenCnt.increment(fTerm);
      sourceTokenIds.add(f);

      ArrayListOfInts ids;
      if (isPassThrough) {
        ids = new ArrayListOfInts();
        ids.add(0);
      } else {
        ids = one2manyAlign.get(f);
      }

      if (ids == null || (isOne2Many == 0 && ids.size() > 1)) {
        continue;
      }

      // find phrase in LHS and match to phrase in RHS
      if (isMany2Many) {
        fPhrase += fTerm + " ";
        targetTokenIds = targetTokenIds.mergeNoDuplicates(ids);       
      }

      String eTerm = null;
      for (int e : ids) {
        eTerm = rhs[e];

        // assumption: if this is pass-through rule, re-stem token in doc-language
        if (isPassThrough || (unknownWords != null && unknownWords.contains(fTerm))) {
          eTerm = stemmed2Stemmed.get(eTerm);
        }

        if (eTerm == null || docLangTokenizer.isStopWord(eTerm)) {
          //          LOG.info("Skipped trg token " + eTerm);
          eTerm = null;
          continue;     
        }
        bagOfTargetTokens.add(eTerm);
        if (isOne2Many <= 1) {
          if (probDist.containsKey(fTerm)) {
            HMapSFW eToken2Prob = probDist.get(fTerm);
            eToken2Prob.increment(eTerm, weight);
          } else {
            HMapSFW eToken2Prob = new HMapSFW();
            eToken2Prob.put(eTerm, weight);
            probDist.put(fTerm, eToken2Prob);
          }
        }
      }

      if (isOne2Many == 2) {
        // if ids.size() > 1 eTerm is a multi-token expression
        // even if eTerm is overwritten here, we need to do above loop to update bagOfTargetTokens
        if (ids.size() > 1) {
          eTerm = isConsecutiveWithStopwords(ids, rhs, docLangTokenizer);     // <---- heuristic
        }

        // no proper translation on target-side (e.g., stopword OR non-consecutive multi-word translation), let's skip
        if (eTerm == null) {
          continue;
        }
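The token filter above relies on the regex \[X,\d+\] to skip SCFG variables. A quick check of what the pattern accepts (illustrative tokens):

      // Matches variable tokens such as [X,1] or [X,12], but not plain words.
      System.out.println("[X,1]".matches("\\[X,\\d+\\]"));   // true
      System.out.println("[X,12]".matches("\\[X,\\d+\\]"));  // true
      System.out.println("casa".matches("\\[X,\\d+\\]"));    // false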

      String[] alPair = alignment.split("-");
      int f = Integer.parseInt(alPair[0]);
      int e = Integer.parseInt(alPair[1]);

      if (!one2manyAlign.containsKey(f)) {
        one2manyAlign.put(f, new ArrayListOfInts());
      }
      one2manyAlign.get(f).add(e);
    }

    // for each source token id, sort ids of its translations in ascending order
    for (Integer f : one2manyAlign.keySet()) {
      ArrayListOfInts lst = one2manyAlign.get(f);
      lst.sort();    // sorts the list in place
      one2manyAlign.put(f, lst);
    }

    return one2manyAlign;
  }
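A usage sketch (the method name and signature are inferred from the excerpt; the actual code may differ): given the alignment string "0-0 0-1 2-1", source token 0 maps to target tokens {0, 1} and source token 2 maps to {1}.

  // Hypothetical call, assuming the method parses a whitespace-delimited
  // alignment string of "f-e" pairs into a source-to-targets map.
  Map<Integer, ArrayListOfInts> one2manyAlign = getOne2ManyAlign("0-0 0-1 2-1");
  // one2manyAlign.get(0) -> [0, 1]
  // one2manyAlign.get(2) -> [1]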
