Package edu.umd.cloud9.io.array

Examples of edu.umd.cloud9.io.array.ArrayListOfIntsWritable


      String lang = p.getLanguage();
      int langID;

      ArrayListWritable<Text> sentences;
      ArrayListWritable<HMapSFW> vectors = new ArrayListWritable<HMapSFW>();
      ArrayListOfIntsWritable sentLengths = new ArrayListOfIntsWritable();
      // identify sentences in document, filter out ones below MinSentLength threshold
      // convert each sentence into a tf-idf vector, using general DF map for collection and a heuristic for avg. doc length
      // filter out sentences for which the vector has less than MinVectorTerms terms
  
      try {
        String article = p.getContent();
        if (lang.equals(eLang)) {
          sentences = helper.getESentences(article, vectors, sentLengths);   
          langID = CLIRUtils.E;
        }else if (lang.equals(fLang)){
          // Turkish Wiki articles' XML does not encode paragraph breaks
          if (lang.equals("tr")) {
            article = article.replaceAll("\\.", ". ");
          }
          sentences = helper.getFSentences(article, vectors, sentLengths);
          langID = CLIRUtils.F;
        }else {
          throw new RuntimeException("Unknown language: " + lang);
        }
      } catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException("Error in sentence detection for language: " + lang + ", helper: " + helper + " article title: " + p.getTitle());
      }

      // documents with no sentences (after we filter out some by length)
      if (sentences.size() == 0) {
        if (langID == CLIRUtils.E) {
          reporter.incrCounter(Docs.EEmpty, 1)
        }else {
          reporter.incrCounter(Docs.FEmpty, 1);
        }
      }else{
        if (langID == CLIRUtils.E) {
          reporter.incrCounter(Docs.E, 1);
        }else {
          reporter.incrCounter(Docs.F, 1);
        }
      }  

      for (int i = 0; i < sentences.size(); i++) {
        if (langID == CLIRUtils.E) {
          if (helper.getEOOVRate(sentences.get(i).toString()) > maxOOVRate ) {
            reporter.incrCounter(Sentences.OOV, 1);
            continue;
          }         
          reporter.incrCounter(Sentences.ELength, sentLengths.get(i));
          reporter.incrCounter(Sentences.E, 1);   
        }else {
          if (helper.getFOOVRate(sentences.get(i).toString()) > maxOOVRate ) {
            reporter.incrCounter(Sentences.OOV, 1);
            continue;
          }
          reporter.incrCounter(Sentences.FLength, sentLengths.get(i));
          reporter.incrCounter(Sentences.F, 1);   
        }
        keyOut.set(docno, langID);     
        valOut.set(langID, sentences.get(i), vectors.get(i));
        output.collect(keyOut, valOut);
View Full Code Here


   * @param i
   *     length of permutation
   */
  public PermutationByBit(int i){
    rand = new Random();
    randPerm = new ArrayListOfIntsWritable();
    for(int j = 0; j < i; j++){
      randPerm.add(j);
    }
    length = i;
  }
View Full Code Here

      int i = rand.nextInt(length);
      int j = randPerm.get(i);
      randPerm.set(i, randPerm.get(k));
      randPerm.set(k, j);
    }
    return new ArrayListOfIntsWritable(randPerm);
 
View Full Code Here

  public static void main(String[] args){
    int SIZE = 1000;
    PermutationByBit p = new PermutationByBit(SIZE);
    for(int k=0;k<100;k++){
      ArrayListOfIntsWritable a = p.nextPermutation();

      //make sure the permutation is not out of bounds
      for(int i=0;i<SIZE;i++){
        assertTrue(i+"-->"+a.get(i),a.get(i)<SIZE && a.get(i)>=0);
      }

      //make sure each position is included in the permutation exactly once
      int[] positions = new int[SIZE];
      for(int i=0;i<SIZE;i++){
        if(positions[a.get(i)]==1){
          fail("Same position included twice: "+a.get(i));
        }
        positions[a.get(i)]=1;
      }
      for(int i=0;i<SIZE;i++){
        if(positions[i]==0){
          //          System.out.println(java.util.Arrays.binarySearch(positions, i));
          fail("Position not included: "+i);
View Full Code Here

          int fDocno = key.getRightElement();
          fDocno -= 1000000000;
          int eDocno = key.getLeftElement();
          if(langID == CLIRUtils.E){
            if(!pwsimMapping.containsKey(eDocno)){
              pwsimMapping.put(eDocno, new ArrayListOfIntsWritable());
            }
            pwsimMapping.get(eDocno).add(fDocno);   // we add 1000000000 to foreign docnos to distinguish them during pwsim algo
          }else{
            if(!pwsimMapping.containsKey(fDocno)){
              pwsimMapping.put(fDocno, new ArrayListOfIntsWritable());
            }
            pwsimMapping.get(fDocno).add(eDocno);   // we add 1000000000 to foreign docnos to distinguish them during pwsim algo
          }
          key = (PairOfInts) reader.getKeyClass().newInstance();
          value = (IntWritable) reader.getValueClass().newInstance();
View Full Code Here

      String lang = p.getLanguage();
      int langID;

      ArrayListWritable<Text> sentences;
      ArrayListWritable<HMapSFW> vectors = new ArrayListWritable<HMapSFW>();
      ArrayListOfIntsWritable sentLengths = new ArrayListOfIntsWritable();
      try {
        // identify sentences in document, filter out ones below MinSentLength threshold
        // convert each sentence into a tf-idf vector, using general DF map for collection and a heuristic for avg. doc length
        // filter out sentences for which the vector has less than MinVectorTerms terms
        String article = p.getContent();
        if (lang.equals("en")) {
          sentences = helper.getESentences(article, vectors, sentLengths);   
          langID = CLIRUtils.E;
        }else {
          // Turkish Wiki articles' XML does not encode paragraph breaks
          if (lang.equals("tr")) {
            article = article.replaceAll("\\.", ". ");
          }
          sentences = helper.getFSentences(article, vectors, sentLengths);
          langID = CLIRUtils.F;
        }
      } catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException(e);
      }

      // documents with no sentences (after we filter out some by length)
      if (sentences.size() == 0) {
        if (langID == CLIRUtils.E) {
          reporter.incrCounter(Docs.EEmpty, 1)
        }else {
          reporter.incrCounter(Docs.FEmpty, 1);
        }
      }else{
        if (langID == CLIRUtils.E) {
          reporter.incrCounter(Docs.E, 1);
        }else {
          reporter.incrCounter(Docs.F, 1);
        }
      }  

      for (int i = 0; i < sentences.size(); i++) {
        if (langID == CLIRUtils.E) {
          if (helper.getEInVocabRate(sentences.get(i).toString()) < minInVocabRate ) {
            reporter.incrCounter(Sentences.OOV, 1);
            return;
          }         
          reporter.incrCounter(Sentences.ELength, sentLengths.get(i));
          reporter.incrCounter(Sentences.E, 1);   
        }else {
          if (helper.getFInVocabRate(sentences.get(i).toString()) < minInVocabRate ) {
            reporter.incrCounter(Sentences.OOV, 1);
            return;
          }
          reporter.incrCounter(Sentences.FLength, sentLengths.get(i));
          reporter.incrCounter(Sentences.F, 1);   
        }
        keyOut.set(docno, langID);     
        valOut.set(langID, sentences.get(i), vectors.get(i));
        output.collect(keyOut, valOut);
View Full Code Here

    }
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, new Path(randomPermFile),
        IntWritable.class, ArrayListOfIntsWritable.class);
    Permutation p = new PermutationByBit(numBits);
    for (int i = 0; i < numOfPermutations; i++) {
      ArrayListOfIntsWritable perm = p.nextPermutation();
      writer.append(new IntWritable(i), perm);
      sLogger.debug(i + ":" + perm);
    }
    writer.close();
    sLogger.info("Random permutations written.");
View Full Code Here

    System.out.println(s);

    int loopcnt = 0;
    MinhashSignature permutedS = new MinhashSignature(D);
    while (loopcnt++ < 10) {
      ArrayListOfIntsWritable a = p.nextPermutation();
      s.perm(a, permutedS);
      for (int i = 0; i < s.size(); i++) {
        assertTrue(permutedS.containsTerm(s.get(i)));
      }
      assertTrue(permutedS.size() == s.size());
View Full Code Here

        if (!anchor.isExternalInLink() && !anchor.isInternalInLink()) {
          continue;
        }

        keyOut.set(anchor.getText());
        anchorTextTarget.setSources(new ArrayListOfIntsWritable(anchor.getDocuments()));
        anchorTextTarget.setWeight(weightingScheme.getWeight(key.get(), anchor));
        output.collect(keyOut, anchorTextTarget);
      }
    }
View Full Code Here

    }
    System.out.println(s);

    PermutationByBit p = new PermutationByBit(64);
    SixtyFourBitSignature permutedS = new SixtyFourBitSignature(64);
    ArrayListOfIntsWritable a;
    int loopcnt = 0;
    while (loopcnt++ < 1000000) {
        a = p.nextPermutation();
        s.perm(a, permutedS);
       
View Full Code Here

TOP

Related Classes of edu.umd.cloud9.io.array.ArrayListOfIntsWritable

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.