Package edu.umd.cloud9.collection.wikipedia

Examples of edu.umd.cloud9.collection.wikipedia.WikipediaPage


          "-output_path=" + indexRootPath + "/wiki-docid-tmp",
          "-output_file=" + mappingFile.toString(),
          "-wiki_language=" + collectionLang };
      LOG.info("Running BuildWikipediaDocnoMapping with args " + Arrays.toString(arr));

      BuildWikipediaDocnoMapping tool = new BuildWikipediaDocnoMapping();
      tool.setConf(conf);
      tool.run(arr);

      fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true);
    } else {
      LOG.info("Docno mapping already exists at: " + mappingFile);
    }

    // Repack Wikipedia into sequential compressed block
    if (!fs.exists(new Path(seqCollection + "/part-00000"))) {
      LOG.info(seqCollection + " doesn't exist, creating...");
      String[] arr = new String[] { "-input=" + rawCollection,
          "-output=" + seqCollection,
          "-mapping_file=" + mappingFile.toString(),
          "-compression_type=block",
          "-wiki_language=" + collectionLang };
      LOG.info("Running RepackWikipedia with args " + Arrays.toString(arr));

      RepackWikipedia tool = new RepackWikipedia();
      tool.setConf(conf);
      tool.run(arr);
    } else {
      LOG.info("Repacked collection already exists at: " + seqCollection);     
    }

    conf.set(Constants.CollectionName, "Wikipedia-"+collectionLang);
View Full Code Here


        "-mapping_file=" + mappingFile.toString(),
        "-compression_type=block",
        "-wiki_language=" + collectionLang };
    LOG.info("Running RepackWikipedia with args " + Arrays.toString(arr));

    RepackWikipedia tool = new RepackWikipedia();
    tool.setConf(conf);
    tool.run(arr);

    conf.set(Constants.CollectionName, "Wikipedia-"+collectionLang);
    conf.setInt(Constants.NumMapTasks, numMappers);
    conf.setInt(Constants.NumReduceTasks, numReducers);
    conf.set(Constants.CollectionPath, seqCollection);
View Full Code Here

    // Repack Wikipedia into sequential compressed block
    p = new Path(seqCollection);
    if (!fs.exists(p)) {
      LOG.info(seqCollection + " doesn't exist, creating...");
      String[] arr = new String[] { rawCollection, seqCollection, mappingFile.toString(), "block"};
      RepackWikipedia tool = new RepackWikipedia();
      tool.setConf(conf);
      tool.run(arr);
    }

    conf.set("Ivory.CollectionName", "Wikipedia-"+collectionLang);
    conf.setInt("Ivory.NumMapTasks", numMappers);
    conf.setInt("Ivory.NumReduceTasks", numReducers);
View Full Code Here

          "-mapping_file=" + mappingFile.toString(),
          "-compression_type=block",
          "-wiki_language=" + collectionLang };
      LOG.info("Running RepackWikipedia with args " + Arrays.toString(arr));

      RepackWikipedia tool = new RepackWikipedia();
      tool.setConf(conf);
      tool.run(arr);
    } else {
      LOG.info("Repacked collection already exists at: " + seqCollection);     
    }

    conf.set(Constants.CollectionName, "Wikipedia-"+collectionLang);
View Full Code Here

      }
    }

    public void map(Writable docnoKey, Indexable page, OutputCollector<PairOfInts, WikiDocInfo> output, Reporter reporter) throws IOException {
      int docno = ((IntWritable)docnoKey).get();
      WikipediaPage p = (WikipediaPage) page;
      String lang = p.getLanguage();
      ArrayListOfIntsWritable similarDocnos;

      // we only load the mapping once, during the first map() call of a mapper.
      // this works b/c all input kv pairs of a given mapper will have same lang id (reason explained above)
      if(pwsimMapping.isEmpty()){
        loadPairs(pwsimMapping, lang, mJob, reporter);
        sLogger.debug(pwsimMapping.size());
      }
     
      // if no similar docs for docno, return
      if(pwsimMapping.containsKey(docno)){
        similarDocnos = pwsimMapping.get(docno);  
      }else{
        return;
      }

      ArrayListWritable<Text> sentences;
      ArrayListWritable<HMapSFW> vectors = new ArrayListWritable<HMapSFW>();
      ArrayListOfIntsWritable sentLengths = new ArrayListOfIntsWritable();
      try {
        if(lang.equals("en")){
          // identify sentences in document, filter out ones below MinSentLength threshold
          // convert each sentence into a tf-idf vector, using general DF map for collection and a heuristic for avg. doc length
          // filter out sentences for which the vector has less than MinVectorTerms terms
          sentences = helper.getESentences(p.getContent(), vectors, sentLengths);   
         
        }else{
          sentences = helper.getFSentences(p.getContent(), vectors, sentLengths);
        }
        if(sentences.size() != vectors.size()) {
          throw new RuntimeException("Sentences.size != Vectors.size");
        }
      } catch (Exception e) {
View Full Code Here

        if (fileStats[i].getPath().getName().startsWith("_")) {
          continue;
        }

        LOG.info("processing " + fileStats[i].getPath());
        FSLineReader reader = new FSLineReader(fileStats[i].getPath(), fs);

        Text line = new Text();
        while (reader.readLine(line) > 0) {
          String[] arr = line.toString().split("\\t+", 2);

          int docno = Integer.parseInt(arr[0]);
          int len = Integer.parseInt(arr[1]);

          // Note that because of speculative execution there may be
          // multiple copies of doclength data. Therefore, we can't
          // just count number of doclengths read. Instead, keep track
          // of largest docno encountered.
          if (docno < docnoOffset) {
            throw new RuntimeException(
                "Error: docno " + docno + " < docnoOffset " + docnoOffset + "!");
          }

          doclengths[docno - docnoOffset] = len;

          if (docno > maxDocno) {
            maxDocno = docno;
          }
          if (docno < minDocno) {
            minDocno = docno;
          }
        }
        reader.close();
        context.getCounter(DocLengths.Files).increment(1);
      }

      LOG.info("min docno: " + minDocno);
      LOG.info("max docno: " + maxDocno);
View Full Code Here

   * @param size
   *     number of bits
   */
  public MinhashSignature(){
    super();
    terms = new ArrayListOfIntsWritable();
  }
View Full Code Here

    super();
    terms = new ArrayListOfIntsWritable();
  }

  public MinhashSignature(ArrayListOfIntsWritable b){
    terms = new ArrayListOfIntsWritable(b);
  }
View Full Code Here

    this(other.terms);
  }

  public MinhashSignature(int numTerms){      //need this constructor for general purposes.
    super();
    terms = new ArrayListOfIntsWritable(numTerms);
  }
View Full Code Here


  @Override
  public int hammingDistance(Signature signature, int threshold){
    MinhashSignature s2 = (MinhashSignature) signature;
    ArrayListOfIntsWritable l1 = this.terms;
    ArrayListOfIntsWritable l2 = s2.terms;

    int count=0;
    for(int i=0;i<l1.size();i++){

      int i1 = l1.get(i), i2=l2.get(i);
      if(i1!=i2){
        count++;
        if(count>threshold){
          return count;
        }
View Full Code Here

TOP

Related Classes of edu.umd.cloud9.collection.wikipedia.WikipediaPage

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.