Examples of edu.umd.cloud9.webgraph.data.AnchorText

edu.umd.cloud9.io.map.HMapSFW

This data structure represents a line of anchor text. A line of anchor text has some text, a weight, and a set of sources (targets) associated with it. Sources (targets) are the pages a line of anchor text originates from (points to) when the underlying link is an incoming (outgoing) link.

The implemented iterator makes it possible to iterate through the source or target documents for each line of anchor text.
@author Nima Asadi

    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);


    SequenceFile.Reader reader;
    IntWritable key = new IntWritable();
    HMapIFW map = new HMapIFW();
    WeightedIntDocVector value = new WeightedIntDocVector();


    reader = new SequenceFile.Reader(fs.getConf(),
        SequenceFile.Reader.file(new Path(dewikiEn + "/test_wt-int-doc-vectors/part-00000")));
    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    map = value.getWeightedTerms();
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(deIntDocVector1, value);


    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    map = value.getWeightedTerms();
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(deIntDocVector2, value);
    reader.close();
  }

View Full Code Here

    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);


    SequenceFile.Reader reader;
    IntWritable key = new IntWritable();
    HMapIFW map = new HMapIFW();
    WeightedIntDocVector value = new WeightedIntDocVector();


    reader = new SequenceFile.Reader(fs.getConf(),
        SequenceFile.Reader.file(new Path(galagoIndex + "/test_wt-int-doc-vectors/part-00000")));


    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    map = value.getWeightedTerms();
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(galagoIntDocVector1, value);


    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    map = value.getWeightedTerms();
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(galagoIntDocVector2, value);
    reader.close();
  }

View Full Code Here

    FileSystem fs = FileSystem.get(conf);


    SequenceFile.Reader reader;
    IntWritable key = new IntWritable();
    WeightedIntDocVector value = new WeightedIntDocVector();
    HMapIFW map = new HMapIFW();


    reader = new SequenceFile.Reader(fs.getConf(),
        SequenceFile.Reader.file(new Path(opennlpIndex + "/test_wt-int-doc-vectors/part-00000")));
    reader.next(key, value);
    map = value.getWeightedTerms();
    System.out.println("*** top 10 terms ***");
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(opennlpIntDocVector1, value);


    reader.next(key, value);
    map = value.getWeightedTerms();
    System.out.println("*** top 10 terms ***");
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(opennlpIntDocVector2, value);
    reader.close();
  }

View Full Code Here

   *     FileSystem object
   * @return
   *     mapping from term ids to df values
   */
  public static HMapIFW readTransDfTable(Path path, FileSystem fs) {
    HMapIFW transDfTable = new HMapIFW();
    try {
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());


      IntWritable key = (IntWritable) reader.getKeyClass().newInstance();
      FloatWritable value = (FloatWritable) reader.getValueClass().newInstance();


      while (reader.next(key, value)) {
        transDfTable.put(key.get(), value.get());
        //        logger.info(key.get()+"-->"+value.get());
        key = (IntWritable) reader.getKeyClass().newInstance();
        value = (FloatWritable) reader.getValueClass().newInstance();
      }
      reader.close();

View Full Code Here

   *     contains mapping from F-terms to their df values
   * @return
   *     mapping from E-terms to their computed df values
   */
  public static HMapIFW translateDFTable(Vocab eVocabSrc, Vocab fVocabTrg, TTable_monolithic_IFAs e2f_probs, FrequencySortedDictionary dict, DfTableArray dfTable){
    HMapIFW transDfTable = new HMapIFW();
    for(int e=1;e<eVocabSrc.size();e++){
      int[] fS = e2f_probs.get(e).getTranslations(0.0f);
      float df=0;
      for(int f : fS){
        float probEF = e2f_probs.get(e, f);
        String fTerm = fVocabTrg.get(f);
        int id = dict.getId(fTerm); 
        if(id != -1){
          float df_f = dfTable.getDf(id);        
          df += (probEF*df_f);
        }else{
          logger.debug(fTerm+" not in dict");
        }
      }
      transDfTable.put(e, df);
    }
    return transDfTable;
  }

View Full Code Here

   *     mapping from F-terms to their df values
   * @return
   *     mapping from E-terms to their computed df values
   */
  public static HMapIFW translateDFTable(Vocab eVocabSrc, Vocab fVocabTrg, TTable_monolithic_IFAs e2f_probs, HMapSIW dfs){
    HMapIFW transDfTable = new HMapIFW();
    for(int e=1;e<eVocabSrc.size();e++){
      int[] fS = null;
      try {
        fS = e2f_probs.get(e).getTranslations(0.0f);
      } catch (Exception e1) {
        e1.printStackTrace();
      }
      float df=0;
      for(int f : fS){
        float probEF = e2f_probs.get(e, f);
        String fTerm = fVocabTrg.get(f);
        if(!dfs.containsKey(fTerm)){  //only if word is in the collection, can it contribute to the df values.
          continue;
        }      
        float df_f = dfs.get(fTerm);
        df+=(probEF*df_f);
      }
      transDfTable.put(e, df);
    }
    return transDfTable;
  }

View Full Code Here


      // add token translations into a #combine of #weight array structures
      JsonArray tokensArr = new JsonArray();
      if (tokenWeight > 0) {
        for (String srcToken : stemmedSourceTokens) {
          HMapSFW nbestDist = translation.getDistributionOf(srcToken);


          if (defaultTokenizer.isStopWord(srcToken)){
            continue;
          }
          LOG.info("Processing "+srcToken);


          // combine translations from N-best AND bilingual dictionary
          List<PairOfFloatMap> tokenRepresentationList = new ArrayList<PairOfFloatMap>();


          // Pr{bitext}
          if (bitextWeight > 0) {
            HMapSFW bitextDist = clGenerator.getTranslations(origQuery.trim(), srcToken, pairsInGrammar, stemmed2Stemmed);
            if(bitextDist != null && !bitextDist.isEmpty()){
              tokenRepresentationList.add(new PairOfFloatMap(bitextDist, bitextWeight));
            }
          }


          // Pr{scfg}
          if (scfgWeight > 0) {
            HMapSFW scfgDist = scfgGenerator.getTranslations(origQuery.trim(), srcToken, probMap, stemmed2Stemmed);
            if (scfgDist != null && !scfgDist.isEmpty() ){
              tokenRepresentationList.add(new PairOfFloatMap(scfgDist, scfgWeight));
            }
          }


          // Pr{n-best}
          if (mtWeight > 0 && nbestDist != null && !nbestDist.isEmpty()) {
            Utils.normalize(nbestDist);
            tokenRepresentationList.add(new PairOfFloatMap(nbestDist, mtWeight));
          }


          JsonArray combinedArr;
          float scale = 1;
          if (scaling) {
            scale = scale * translation.getSourceTokenCnt().get(srcToken) / ((float)translation.getCount());
          }
          if(tokenRepresentationList.size() == 0) {
            continue;       // if empty distr., do not represent this source token in query
          } else if(tokenRepresentationList.size() == 1) {
            combinedArr = Utils.createJsonArrayFromProbabilities(Utils.scaleProbMap(lexProbThreshold, scale, tokenRepresentationList.get(0).getMap()));
          } else {
            combinedArr = Utils.createJsonArrayFromProbabilities(Utils.combineProbMaps(lexProbThreshold, scale, tokenRepresentationList));
          }


          JsonObject tokenWeightedArr = new JsonObject();          
          tokenWeightedArr.add("#weight", combinedArr);


          // optional: if this source token has occurred more than once per query, reflect this in the representation
          //  for (int i = 0; i < Math.ceil(tokenCount.get(srcToken)/(float)kBest); i++) {
          //    tokensArr.put(tokenWeightedArr);
          //  }
          tokensArr.add(tokenWeightedArr);
        }
        queryTJson.add("#combine", tokensArr);
      }


      // combine the token-based and phrase-based representations into a #combweight structure
      JsonArray queryJsonArr = new JsonArray();


      HMapSFW scaledPhrase2Weight = null;
      if (phraseWeight > 0) {
        scaledPhrase2Weight = Utils.scaleProbMap(lexProbThreshold, phraseWeight, translation.getPhraseDist());      
        for (String phrase : scaledPhrase2Weight.keySet()) {
          queryJsonArr.add(new JsonPrimitive(scaledPhrase2Weight.get(phrase)));
          queryJsonArr.add(new JsonPrimitive(phrase));
        }
      }
      if (tokenWeight > 0) {
        queryJsonArr.add(new JsonPrimitive(tokenWeight));

View Full Code Here

    }   
    return probMap;
  }


  private String getBestTranslation(String query, String token) {
    HMapSFW probDist = query2probMap.get(query).get(token);


    if(probDist == null){
      return token;
    }


    float maxProb = 0f;
    String maxProbTrans = null;
    for (edu.umd.cloud9.util.map.MapKF.Entry<String> entry : probDist.entrySet()) {
      if (entry.getValue() > maxProb) {
        maxProb = entry.getValue();
        maxProbTrans = entry.getKey();
      }
    }

View Full Code Here

    }
    return maxProbTrans;
  }


  protected HMapSFW getTranslations(String query, String token, Map<String, HMapSFW> probMap, Map<String, String> stemmed2Stemmed) {
    HMapSFW probDist = null;
    try {
      probDist = probMap.get(token);
    } catch (NullPointerException e) {
      LOG.info("Prob map not found for " + query);
      e.printStackTrace();
    }
    
    if(probDist == null){
      // borrow OOV word heuristic from MT: if no translation found, include itself as translation
      probDist = new HMapSFW();
      String targetStem = stemmed2Stemmed.get(token);
      String target = (stemmed2Stemmed == null || targetStem == null) ? token : stemmed2Stemmed.get(token);
      probDist.put(target, 1);      
      return probDist;
    }


    return probDist;
  }

View Full Code Here

    });
    Interp_AP.put(2, new String[] {
        "78", "0.3167","77", "0.2599","35", "0.0019","36", "0.0033","33", "0.3573","39", "0.1078","38", "0.0","43", "0.0679","42", "0.2039","41", "0.147","40", "1.0E-4","82", "0.3175","83", "0.1541","80", "0.082","87", "0.291","84", "0.257","85", "0.0732","67", "0.1302","66", "0.0092","69", "0.0","68", "0.6626","23", "0.1562","26", "0.125","28", "0.4415","29", "0.1302","2", "0.1573","30", "0.2587","6", "0.0582","5", "0.0","32", "0.3711","70", "0.04","9", "0.2236","71", "0.5076","72", "0.383","73", "0.0065","74", "0.0679","75", "0.0034","76", "0.1072","59", "0.0353","58", "0.0933","57", "0.378","19", "0.2664","56", "0.2486","18", "0.0453","15", "0.4233","16", "0.0514","12", "0.0459","64", "0.4827","65", "0.4462","62", "0.5034","63", "0.0139","99", "0.267","61", "0.0392","100", "0.0264","98", "0.3932","49", "0.0026","97", "0.0025","48", "2.0E-4","96", "0.03","95", "3.0E-4","94", "0.4591","45", "0.3061","93", "0.0957","44", "0.2947","92", "0.2229","47", "0.0","91", "0.6519","46", "0.2098","90", "0.4246","51", "0.0","52", "0.1495","53", "0.1198","54", "0.2776"
    });


    HMapSFW tenbestAPMap = array2Map(Nbest_AP.get(2));
    HMapSFW onebestAPMap = array2Map(Onebest_AP.get(2));
    HMapSFW grammarAPMap = array2Map(grammar_AP.get(2));
    HMapSFW tokenAPMap = array2Map(baseline_token_AP);
    HMapSFW gridAPMap = array2Map(Interp_AP.get(2));
    System.out.println(countNumberOfImprovedTopics(tokenAPMap, gridAPMap));
    System.out.println(countNumberOfImprovedTopics(tokenAPMap, tenbestAPMap));
    System.out.println(countNumberOfImprovedTopics(tokenAPMap, onebestAPMap));
    System.out.println(countNumberOfImprovedTopics(tokenAPMap, grammarAPMap));
    System.out.println(countNumberOfNegligibleTopics(tokenAPMap, gridAPMap));

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of edu.umd.cloud9.webgraph.data.AnchorText

bak.pcj.IntIterator

edu.umd.cloud9.example.bfs.BfsNodeTest

edu.umd.cloud9.example.pagerank.PageRankNodeTest

edu.umd.cloud9.io.array.ArrayListOfIntsWritable

edu.umd.cloud9.io.fastutil.Int2FloatOpenHashMapWritableTest

edu.umd.cloud9.io.fastutil.Int2IntOpenHashMapWritableTest

edu.umd.cloud9.io.fastutil.String2FloatOpenHashMapWritableTest

edu.umd.cloud9.io.fastutil.String2IntOpenHashMapWritableTest

edu.umd.cloud9.io.map.HMapIDW

edu.umd.cloud9.io.map.HMapIDWTest

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.