Examples of edu.umd.cloud9.webgraph.data.AnchorText

edu.umd.cloud9.io.map.HMapIFW

This data structure represents a line of anchor text. A line of anchor text has some text, a weight, and a set of sources (targets) associated with it. Sources (targets) are the pages a line of anchor text originates from (points to) when the underlying link is an incoming (outgoing) link.

The implemented iterator makes it possible to iterate through the source or target documents for each line of anchor text.
@author Nima Asadi

          "-output_path=" + indexRootPath + "/wiki-docid-tmp",
          "-output_file=" + mappingFile.toString(),
          "-wiki_language=" + collectionLang };
      LOG.info("Running BuildWikipediaDocnoMapping with args " + Arrays.toString(arr));


      BuildWikipediaDocnoMapping tool = new BuildWikipediaDocnoMapping();
      tool.setConf(conf);
      tool.run(arr);


      fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true);
    } else {
      LOG.info("Docno mapping already exists at: " + mappingFile);
    }


    // Repack Wikipedia into sequential compressed block
    if (!fs.exists(new Path(seqCollection + "/part-00000"))) {
      LOG.info(seqCollection + " doesn't exist, creating...");
      String[] arr = new String[] { "-input=" + rawCollection,
          "-output=" + seqCollection,
          "-mapping_file=" + mappingFile.toString(),
          "-compression_type=block",
          "-wiki_language=" + collectionLang };
      LOG.info("Running RepackWikipedia with args " + Arrays.toString(arr));


      RepackWikipedia tool = new RepackWikipedia();
      tool.setConf(conf);
      tool.run(arr);
    } else {
      LOG.info("Repacked collection already exists at: " + seqCollection);      
    }


    conf.set(Constants.CollectionName, "Wikipedia-"+collectionLang);

View Full Code Here

        "-mapping_file=" + mappingFile.toString(),
        "-compression_type=block",
        "-wiki_language=" + collectionLang };
    LOG.info("Running RepackWikipedia with args " + Arrays.toString(arr));


    RepackWikipedia tool = new RepackWikipedia();
    tool.setConf(conf);
    tool.run(arr);


    conf.set(Constants.CollectionName, "Wikipedia-"+collectionLang);
    conf.setInt(Constants.NumMapTasks, numMappers);
    conf.setInt(Constants.NumReduceTasks, numReducers);
    conf.set(Constants.CollectionPath, seqCollection);

View Full Code Here

    // Repack Wikipedia into sequential compressed block
    p = new Path(seqCollection);
    if (!fs.exists(p)) {
      LOG.info(seqCollection + " doesn't exist, creating...");
      String[] arr = new String[] { rawCollection, seqCollection, mappingFile.toString(), "block"};
      RepackWikipedia tool = new RepackWikipedia();
      tool.setConf(conf);
      tool.run(arr);
    }


    conf.set("Ivory.CollectionName", "Wikipedia-"+collectionLang);
    conf.setInt("Ivory.NumMapTasks", numMappers);
    conf.setInt("Ivory.NumReduceTasks", numReducers);

View Full Code Here

          "-mapping_file=" + mappingFile.toString(),
          "-compression_type=block",
          "-wiki_language=" + collectionLang };
      LOG.info("Running RepackWikipedia with args " + Arrays.toString(arr));


      RepackWikipedia tool = new RepackWikipedia();
      tool.setConf(conf);
      tool.run(arr);
    } else {
      LOG.info("Repacked collection already exists at: " + seqCollection);      
    }


    conf.set(Constants.CollectionName, "Wikipedia-"+collectionLang);

View Full Code Here

        if (fileStats[i].getPath().getName().startsWith("_")) {
          continue;
        }


        LOG.info("processing " + fileStats[i].getPath());
        FSLineReader reader = new FSLineReader(fileStats[i].getPath(), fs);


        Text line = new Text();
        while (reader.readLine(line) > 0) {
          String[] arr = line.toString().split("\\t+", 2);


          int docno = Integer.parseInt(arr[0]);
          int len = Integer.parseInt(arr[1]);


          // Note that because of speculative execution there may be
          // multiple copies of doclength data. Therefore, we can't
          // just count number of doclengths read. Instead, keep track
          // of largest docno encountered.
          if (docno < docnoOffset) {
            throw new RuntimeException(
                "Error: docno " + docno + " < docnoOffset " + docnoOffset + "!");
          }


          doclengths[docno - docnoOffset] = len;


          if (docno > maxDocno) {
            maxDocno = docno;
          }
          if (docno < minDocno) {
            minDocno = docno;
          }
        }
        reader.close();
        context.getCounter(DocLengths.Files).increment(1);
      }


      LOG.info("min docno: " + minDocno);
      LOG.info("max docno: " + maxDocno);

View Full Code Here

   *     FileSystem object
   * @return
   *     mapping from term ids to df values
   */
  public static HMapIFW readTransDfTable(Path path, FileSystem fs) {
    HMapIFW transDfTable = new HMapIFW();
    try {
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());


      IntWritable key = (IntWritable) reader.getKeyClass().newInstance();
      FloatWritable value = (FloatWritable) reader.getValueClass().newInstance();


      while (reader.next(key, value)) {
        transDfTable.put(key.get(), value.get());
        //        logger.info(key.get()+"-->"+value.get());
        key = (IntWritable) reader.getKeyClass().newInstance();
        value = (FloatWritable) reader.getValueClass().newInstance();
      }
      reader.close();

View Full Code Here

   *     ttable E-->F (i.e., Pr(f|e))
   * @return
   *     mapping from E-terms to their computed df values
   */
  public static HMapIFW translateDFTable(Vocab eVocabSrc, Vocab fVocabTrg, TTable_monolithic_IFAs e2f_probs, FrequencySortedDictionary dict, DfTableArray dfTable){
    HMapIFW transDfTable = new HMapIFW();
    for(int e=1;e<eVocabSrc.size();e++){
      int[] fS = e2f_probs.get(e).getTranslations(0.0f);
      float df=0;
      for(int f : fS){
        float probEF = e2f_probs.get(e, f);
        String fTerm = fVocabTrg.get(f);
        int id = dict.getId(fTerm); 
        if(id != -1){
          float df_f = dfTable.getDf(id);        
          df += (probEF*df_f);
        }else{
          logger.debug(fTerm+" not in dict");
        }
      }
      transDfTable.put(e, df);
    }
    return transDfTable;
  }

View Full Code Here

   *     mapping from F-terms to their df values
   * @return
   *     mapping from E-terms to their computed df values
   */
  public static HMapIFW translateDFTable(Vocab eVocabSrc, Vocab fVocabTrg, TTable_monolithic_IFAs e2f_probs, HMapSIW dfs){
    HMapIFW transDfTable = new HMapIFW();
    for(int e=1;e<eVocabSrc.size();e++){
      int[] fS = null;
      try {
        fS = e2f_probs.get(e).getTranslations(0.0f);
      } catch (Exception e1) {
        e1.printStackTrace();
      }
      float df=0;
      for(int f : fS){
        float probEF = e2f_probs.get(e, f);
        String fTerm = fVocabTrg.get(f);
        if(!dfs.containsKey(fTerm)){  //only if word is in the collection, can it contribute to the df values.
          continue;
        }      
        float df_f = dfs.get(fTerm);
        df+=(probEF*df_f);
      }
      transDfTable.put(e, df);
    }
    return transDfTable;
  }

View Full Code Here

        if (e1.getDocno() < mBlockStart)
          continue;
        if (e1.getDocno() >= mBlockEnd)
          break;


        HMapIFW map = new HMapIFW();


        sLogger.debug(key + ": " + e1);


        PostingsReader reader2 = postings.getPostingsReader();


        while (reader2.nextPosting(e2)) {


          sLogger.debug(key + ": " + e1 + ", " + e2);


          if (e1.getDocno() == e2.getDocno())
            continue;


          // compute partial score of similarity for a pair of
          // documents
          float weight = mModel.computeScore(e1.getScore(), e2.getScore(),
              mDocLengthTable.getDocLength(e1.getDocno()), mDocLengthTable
                  .getDocLength(e2.getDocno()));


          map.put(e2.getDocno(), weight);
        }
        output.collect(new IntWritable(e1.getDocno()), map);
      }
    }

View Full Code Here

    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);


    SequenceFile.Reader reader;
    IntWritable key = new IntWritable();
    HMapIFW map = new HMapIFW();
    WeightedIntDocVector value = new WeightedIntDocVector();


    reader = new SequenceFile.Reader(fs.getConf(),
        SequenceFile.Reader.file(new Path(enwikiEn + "/test_wt-int-doc-vectors/part-00000")));


    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    map = value.getWeightedTerms();
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(enIntDocVector1, value);


    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    map = value.getWeightedTerms();
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(enIntDocVector2, value);
    reader.close();
  }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of edu.umd.cloud9.webgraph.data.AnchorText

bak.pcj.IntIterator

edu.umd.cloud9.example.bfs.BfsNodeTest

edu.umd.cloud9.example.pagerank.PageRankNodeTest

edu.umd.cloud9.io.array.ArrayListOfIntsWritable

edu.umd.cloud9.io.fastutil.Int2FloatOpenHashMapWritableTest

edu.umd.cloud9.io.fastutil.Int2IntOpenHashMapWritableTest

edu.umd.cloud9.io.fastutil.String2FloatOpenHashMapWritableTest

edu.umd.cloud9.io.fastutil.String2IntOpenHashMapWritableTest

edu.umd.cloud9.io.map.HMapIDW

edu.umd.cloud9.io.map.HMapIDWTest

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.