Package edu.umd.cloud9.webgraph.data

Examples of edu.umd.cloud9.webgraph.data.AnchorText


    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);

    SequenceFile.Reader reader;
    IntWritable key = new IntWritable();
    HMapIFW map = new HMapIFW();
    WeightedIntDocVector value = new WeightedIntDocVector();

    reader = new SequenceFile.Reader(fs.getConf(),
        SequenceFile.Reader.file(new Path(dewikiEn + "/test_wt-int-doc-vectors/part-00000")));
    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    map = value.getWeightedTerms();
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(deIntDocVector1, value);

    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    map = value.getWeightedTerms();
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(deIntDocVector2, value);
    reader.close();
  }
View Full Code Here


    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);

    SequenceFile.Reader reader;
    IntWritable key = new IntWritable();
    HMapIFW map = new HMapIFW();
    WeightedIntDocVector value = new WeightedIntDocVector();

    reader = new SequenceFile.Reader(fs.getConf(),
        SequenceFile.Reader.file(new Path(galagoIndex + "/test_wt-int-doc-vectors/part-00000")));

    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    map = value.getWeightedTerms();
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(galagoIntDocVector1, value);

    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    map = value.getWeightedTerms();
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(galagoIntDocVector2, value);
    reader.close();
  }
View Full Code Here

    FileSystem fs = FileSystem.get(conf);

    SequenceFile.Reader reader;
    IntWritable key = new IntWritable();
    WeightedIntDocVector value = new WeightedIntDocVector();
    HMapIFW map = new HMapIFW();

    reader = new SequenceFile.Reader(fs.getConf(),
        SequenceFile.Reader.file(new Path(opennlpIndex + "/test_wt-int-doc-vectors/part-00000")));
    reader.next(key, value);
    map = value.getWeightedTerms();
    System.out.println("*** top 10 terms ***");
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(opennlpIntDocVector1, value);

    reader.next(key, value);
    map = value.getWeightedTerms();
    System.out.println("*** top 10 terms ***");
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(opennlpIntDocVector2, value);
    reader.close();
  }
View Full Code Here

   *     FileSystem object
   * @return
   *     mapping from term ids to df values
   */
  public static HMapIFW readTransDfTable(Path path, FileSystem fs) {
    HMapIFW transDfTable = new HMapIFW();
    try {
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());

      IntWritable key = (IntWritable) reader.getKeyClass().newInstance();
      FloatWritable value = (FloatWritable) reader.getValueClass().newInstance();

      while (reader.next(key, value)) {
        transDfTable.put(key.get(), value.get());
        //        logger.info(key.get()+"-->"+value.get());
        key = (IntWritable) reader.getKeyClass().newInstance();
        value = (FloatWritable) reader.getValueClass().newInstance();
      }
      reader.close();
View Full Code Here

   *     contains mapping from F-terms to their df values
   * @return
   *     mapping from E-terms to their computed df values
   */
  public static HMapIFW translateDFTable(Vocab eVocabSrc, Vocab fVocabTrg, TTable_monolithic_IFAs e2f_probs, FrequencySortedDictionary dict, DfTableArray dfTable){
    HMapIFW transDfTable = new HMapIFW();
    for(int e=1;e<eVocabSrc.size();e++){
      int[] fS = e2f_probs.get(e).getTranslations(0.0f);
      float df=0;
      for(int f : fS){
        float probEF = e2f_probs.get(e, f);
        String fTerm = fVocabTrg.get(f);
        int id = dict.getId(fTerm);
        if(id != -1){
          float df_f = dfTable.getDf(id);       
          df += (probEF*df_f);
        }else{
          logger.debug(fTerm+" not in dict");
        }
      }
      transDfTable.put(e, df);
    }
    return transDfTable;
  }
View Full Code Here

   *     mapping from F-terms to their df values
   * @return
   *     mapping from E-terms to their computed df values
   */
  public static HMapIFW translateDFTable(Vocab eVocabSrc, Vocab fVocabTrg, TTable_monolithic_IFAs e2f_probs, HMapSIW dfs){
    HMapIFW transDfTable = new HMapIFW();
    for(int e=1;e<eVocabSrc.size();e++){
      int[] fS = null;
      try {
        fS = e2f_probs.get(e).getTranslations(0.0f);
      } catch (Exception e1) {
        e1.printStackTrace();
      }
      float df=0;
      for(int f : fS){
        float probEF = e2f_probs.get(e, f);
        String fTerm = fVocabTrg.get(f);
        if(!dfs.containsKey(fTerm)){  //only if word is in the collection, can it contribute to the df values.
          continue;
        }     
        float df_f = dfs.get(fTerm);
        df+=(probEF*df_f);
      }
      transDfTable.put(e, df);
    }
    return transDfTable;
  }
View Full Code Here

      // add token translations into a #combine of #weight array structures
      JsonArray tokensArr = new JsonArray();
      if (tokenWeight > 0) {
        for (String srcToken : stemmedSourceTokens) {
          HMapSFW nbestDist = translation.getDistributionOf(srcToken);

          if (defaultTokenizer.isStopWord(srcToken)){
            continue;
          }
          LOG.info("Processing "+srcToken);

          // combine translations from N-best AND bilingual dictionary
          List<PairOfFloatMap> tokenRepresentationList = new ArrayList<PairOfFloatMap>();

          // Pr{bitext}
          if (bitextWeight > 0) {
            HMapSFW bitextDist = clGenerator.getTranslations(origQuery.trim(), srcToken, pairsInGrammar, stemmed2Stemmed);
            if(bitextDist != null && !bitextDist.isEmpty()){
              tokenRepresentationList.add(new PairOfFloatMap(bitextDist, bitextWeight));
            }
          }

          // Pr{scfg}
          if (scfgWeight > 0) {
            HMapSFW scfgDist = scfgGenerator.getTranslations(origQuery.trim(), srcToken, probMap, stemmed2Stemmed);
            if (scfgDist != null && !scfgDist.isEmpty() ){
              tokenRepresentationList.add(new PairOfFloatMap(scfgDist, scfgWeight));
            }
          }

          // Pr{n-best}
          if (mtWeight > 0 && nbestDist != null && !nbestDist.isEmpty()) {
            Utils.normalize(nbestDist);
            tokenRepresentationList.add(new PairOfFloatMap(nbestDist, mtWeight));
          }

          JsonArray combinedArr;
          float scale = 1;
          if (scaling) {
            scale = scale * translation.getSourceTokenCnt().get(srcToken) / ((float)translation.getCount());
          }
          if(tokenRepresentationList.size() == 0) {
            continue;       // if empty distr., do not represent this source token in query
          } else if(tokenRepresentationList.size() == 1) {
            combinedArr = Utils.createJsonArrayFromProbabilities(Utils.scaleProbMap(lexProbThreshold, scale, tokenRepresentationList.get(0).getMap()));
          } else {
            combinedArr = Utils.createJsonArrayFromProbabilities(Utils.combineProbMaps(lexProbThreshold, scale, tokenRepresentationList));
          }

          JsonObject tokenWeightedArr = new JsonObject();         
          tokenWeightedArr.add("#weight", combinedArr);

          // optional: if this source token has occurred more than once per query, reflect this in the representation
          //  for (int i = 0; i < Math.ceil(tokenCount.get(srcToken)/(float)kBest); i++) {
          //    tokensArr.put(tokenWeightedArr);
          //  }
          tokensArr.add(tokenWeightedArr);
        }
        queryTJson.add("#combine", tokensArr);
      }

      // combine the token-based and phrase-based representations into a #combweight structure
      JsonArray queryJsonArr = new JsonArray();

      HMapSFW scaledPhrase2Weight = null;
      if (phraseWeight > 0) {
        scaledPhrase2Weight = Utils.scaleProbMap(lexProbThreshold, phraseWeight, translation.getPhraseDist());     
        for (String phrase : scaledPhrase2Weight.keySet()) {
          queryJsonArr.add(new JsonPrimitive(scaledPhrase2Weight.get(phrase)));
          queryJsonArr.add(new JsonPrimitive(phrase));
        }
      }
      if (tokenWeight > 0) {
        queryJsonArr.add(new JsonPrimitive(tokenWeight));
View Full Code Here

    }  
    return probMap;
  }

  private String getBestTranslation(String query, String token) {
    HMapSFW probDist = query2probMap.get(query).get(token);

    if(probDist == null){
      return token;
    }

    float maxProb = 0f;
    String maxProbTrans = null;
    for (edu.umd.cloud9.util.map.MapKF.Entry<String> entry : probDist.entrySet()) {
      if (entry.getValue() > maxProb) {
        maxProb = entry.getValue();
        maxProbTrans = entry.getKey();
      }
    }
View Full Code Here

    }
    return maxProbTrans;
  }

  protected HMapSFW getTranslations(String query, String token, Map<String, HMapSFW> probMap, Map<String, String> stemmed2Stemmed) {
    HMapSFW probDist = null;
    try {
      probDist = probMap.get(token);
    } catch (NullPointerException e) {
      LOG.info("Prob map not found for " + query);
      e.printStackTrace();
    }
   
    if(probDist == null){
      // borrow OOV word heuristic from MT: if no translation found, include itself as translation
      probDist = new HMapSFW();
      String targetStem = stemmed2Stemmed.get(token);
      String target = (stemmed2Stemmed == null || targetStem == null) ? token : stemmed2Stemmed.get(token);
      probDist.put(target, 1);     
      return probDist;
    }

    return probDist;
  }
View Full Code Here

    });
    Interp_AP.put(2, new String[] {
        "78", "0.3167","77", "0.2599","35", "0.0019","36", "0.0033","33", "0.3573","39", "0.1078","38", "0.0","43", "0.0679","42", "0.2039","41", "0.147","40", "1.0E-4","82", "0.3175","83", "0.1541","80", "0.082","87", "0.291","84", "0.257","85", "0.0732","67", "0.1302","66", "0.0092","69", "0.0","68", "0.6626","23", "0.1562","26", "0.125","28", "0.4415","29", "0.1302","2", "0.1573","30", "0.2587","6", "0.0582","5", "0.0","32", "0.3711","70", "0.04","9", "0.2236","71", "0.5076","72", "0.383","73", "0.0065","74", "0.0679","75", "0.0034","76", "0.1072","59", "0.0353","58", "0.0933","57", "0.378","19", "0.2664","56", "0.2486","18", "0.0453","15", "0.4233","16", "0.0514","12", "0.0459","64", "0.4827","65", "0.4462","62", "0.5034","63", "0.0139","99", "0.267","61", "0.0392","100", "0.0264","98", "0.3932","49", "0.0026","97", "0.0025","48", "2.0E-4","96", "0.03","95", "3.0E-4","94", "0.4591","45", "0.3061","93", "0.0957","44", "0.2947","92", "0.2229","47", "0.0","91", "0.6519","46", "0.2098","90", "0.4246","51", "0.0","52", "0.1495","53", "0.1198","54", "0.2776"
    });

    HMapSFW tenbestAPMap = array2Map(Nbest_AP.get(2));
    HMapSFW onebestAPMap = array2Map(Onebest_AP.get(2));
    HMapSFW grammarAPMap = array2Map(grammar_AP.get(2));
    HMapSFW tokenAPMap = array2Map(baseline_token_AP);
    HMapSFW gridAPMap = array2Map(Interp_AP.get(2));
    System.out.println(countNumberOfImprovedTopics(tokenAPMap, gridAPMap));
    System.out.println(countNumberOfImprovedTopics(tokenAPMap, tenbestAPMap));
    System.out.println(countNumberOfImprovedTopics(tokenAPMap, onebestAPMap));
    System.out.println(countNumberOfImprovedTopics(tokenAPMap, grammarAPMap));
    System.out.println(countNumberOfNegligibleTopics(tokenAPMap, gridAPMap));
View Full Code Here

TOP

Related Classes of edu.umd.cloud9.webgraph.data.AnchorText

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.