Package edu.umd.cloud9.io.array

Examples of edu.umd.cloud9.io.array.ArrayListOfIntsWritable


    int cntBits = s.countSetBits();

    int loopcnt = 0;
    NBitSignature permutedS = new NBitSignature(D);
    while (loopcnt++ < 100) {
      ArrayListOfIntsWritable a = p.nextPermutation();
      s.perm(a, permutedS);
      System.out.println(permutedS.countSetBits());
      assertTrue(cntBits == permutedS.countSetBits());
      // System.out.println(permutedS);
    }
View Full Code Here


    int loopcnt = 0;
    NBitSignature permutedS = new NBitSignature(D);
    while (true) {
      try {
        ArrayListOfIntsWritable a = p.nextPermutation();
        s.perm(a, permutedS);
        System.out.println((loopcnt++) + "\n" + permutedS);
        assertTrue(cntBits == permutedS.countSetBits());
        // System.out.println(permutedS);
      } catch (RuntimeException e) {
View Full Code Here

  // }

  @Test
  public void testSubSignature() {
    PermutationByBit p = new PermutationByBit(D);
    ArrayListOfIntsWritable a = p.nextPermutation();

    for (int i = 0; i < 100000; i++) {

      NBitSignature s = getRandomSignature();
View Full Code Here

    System.out.println(s);

    int loopcnt = 0;
    MinhashSignature permutedS = new MinhashSignature(D);
    while (loopcnt++ < 10) {
      ArrayListOfIntsWritable a = p.nextPermutation();
      s.perm(a, permutedS);
      for (int i = 0; i < s.size(); i++) {
        assertTrue(permutedS.containsTerm(s.get(i)));
      }
      assertTrue(permutedS.size() == s.size());
View Full Code Here

  /**
   * Constructs a target with an empty source list.
   */
  public AnchorTextTarget() {
    sources = new ArrayListOfIntsWritable();
  }
View Full Code Here

          // The key pairs an English docno (left element) with a foreign docno (right element).
          int fDocno = key.getRightElement();
          // 1000000000 was added to foreign docnos to distinguish them during the pwsim algorithm;
          // strip that offset here.
          fDocno -= 1000000000;
          int eDocno = key.getLeftElement();
          if(langID == CLIRUtils.E){
            // Create the list for this English docno on first sight.
            if(!pwsimMapping.containsKey(eDocno)){
              pwsimMapping.put(eDocno, new ArrayListOfIntsWritable());
            }
            pwsimMapping.get(eDocno).add(fDocno);   // append the foreign docno (offset already removed) under its English docno
          }else{
            // Create the list for this foreign docno on first sight.
            if(!pwsimMapping.containsKey(fDocno)){
              pwsimMapping.put(fDocno, new ArrayListOfIntsWritable());
            }
            pwsimMapping.get(fDocno).add(eDocno);   // append the English docno under its foreign docno (offset already removed)
          }
          key = (PairOfInts) reader.getKeyClass().newInstance();
          value = (IntWritable) reader.getValueClass().newInstance();
View Full Code Here

      String lang = p.getLanguage();
      int langID;

      ArrayListWritable<Text> sentences;
      ArrayListWritable<HMapSFW> vectors = new ArrayListWritable<HMapSFW>();
      ArrayListOfIntsWritable sentLengths = new ArrayListOfIntsWritable();
      try {
        // identify sentences in document, filter out ones below MinSentLength threshold
        // convert each sentence into a tf-idf vector, using general DF map for collection and a heuristic for avg. doc length
        // filter out sentences for which the vector has less than MinVectorTerms terms
        String article = p.getContent();
        if (lang.equals("en")) {
          sentences = helper.getESentences(article, vectors, sentLengths);   
          langID = CLIRUtils.E;
        }else {
          // Turkish Wiki articles' XML does not encode paragraph breaks
          if (lang.equals("tr")) {
            article = article.replaceAll("\\.", ". ");
          }
          sentences = helper.getFSentences(article, vectors, sentLengths);
          langID = CLIRUtils.F;
        }
      } catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException(e);
      }

      // documents with no sentences (after we filter out some by length)
      if (sentences.size() == 0) {
        if (langID == CLIRUtils.E) {
          reporter.incrCounter(Docs.EEmpty, 1)
        }else {
          reporter.incrCounter(Docs.FEmpty, 1);
        }
      }else{
        if (langID == CLIRUtils.E) {
          reporter.incrCounter(Docs.E, 1);
        }else {
          reporter.incrCounter(Docs.F, 1);
        }
      }  

      for (int i = 0; i < sentences.size(); i++) {
        float inVocabRate = 0;
        if (langID == CLIRUtils.E) {
          if (helper.getEInVocabRate(sentences.get(i).toString()) < minInVocabRate ) {
            reporter.incrCounter(Sentences.OOV, 1);
            return;
          }         
          reporter.incrCounter(Sentences.ELength, sentLengths.get(i));
          reporter.incrCounter(Sentences.E, 1);   
        }else {
          if (helper.getFInVocabRate(sentences.get(i).toString()) < minInVocabRate ) {
            reporter.incrCounter(Sentences.OOV, 1);
            return;
          }
          reporter.incrCounter(Sentences.FLength, sentLengths.get(i));
          reporter.incrCounter(Sentences.F, 1);   
        }
        keyOut.set(docno, langID);     
        valOut.set(langID, sentences.get(i), vectors.get(i));
        output.collect(keyOut, valOut);
View Full Code Here

        if (!anchor.isExternalInLink() && !anchor.isInternalInLink()) {
          continue;
        }

        keyOut.set(anchor.getText());
        anchorTextTarget.setSources(new ArrayListOfIntsWritable(anchor.getDocuments()));
        anchorTextTarget.setWeight(weightingScheme.getWeight(key.get(), anchor));
        output.collect(keyOut, anchorTextTarget);
      }
    }
View Full Code Here

              // The key pairs an English docno (left element) with a foreign docno (right element).
              int fDocno = key.getRightElement();
              // NOTE(review): foreign docnos are offset by 1000000000 to distinguish them during the
              // pwsim algorithm. The offset removal below is disabled, so fDocno is used exactly as
              // read from the key — confirm whether it still carries the offset at this point.
//          fDocno -= 1000000000;
              int eDocno = key.getLeftElement();
              if(langID == CLIRUtils.E){
                // Create the list for this English docno on first sight.
                if(!pwsimMapping.containsKey(eDocno)){
                  pwsimMapping.put(eDocno, new ArrayListOfIntsWritable());
                }
                pwsimMapping.get(eDocno).add(fDocno);   // append the foreign docno under its English docno
              }else{
                // Create the list for this foreign docno on first sight.
                if(!pwsimMapping.containsKey(fDocno)){
                  pwsimMapping.put(fDocno, new ArrayListOfIntsWritable());
                }
                pwsimMapping.get(fDocno).add(eDocno);   // append the English docno under its foreign docno
              }
              cnt++;
              key = (PairOfInts) reader.getKeyClass().newInstance();
View Full Code Here

      String lang = p.getLanguage();
      int langID;

      ArrayListWritable<Text> sentences;
      ArrayListWritable<HMapSFW> vectors = new ArrayListWritable<HMapSFW>();
      ArrayListOfIntsWritable sentLengths = new ArrayListOfIntsWritable();
      // identify sentences in document, filter out ones below MinSentLength threshold
      // convert each sentence into a tf-idf vector, using general DF map for collection and a heuristic for avg. doc length
      // filter out sentences for which the vector has less than MinVectorTerms terms
      try {
        String article = p.getContent();
        if (lang.equals("en")) {
          sentences = helper.getESentences(article, vectors, sentLengths);   
          langID = CLIRUtils.E;
        }else {
          // Turkish Wiki articles' XML does not encode paragraph breaks
          if (lang.equals("tr")) {
            article = article.replaceAll("\\.", ". ");
          }
          sentences = helper.getFSentences(article, vectors, sentLengths);
          langID = CLIRUtils.F;
        }
      } catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException("Error in sentence detection for language: " + lang + ", helper: " + helper + " article title: " + p.getTitle());
      }

      // documents with no sentences (after we filter out some by length)
      if (sentences.size() == 0) {
        if (langID == CLIRUtils.E) {
          reporter.incrCounter(Docs.EEmpty, 1)
        }else {
          reporter.incrCounter(Docs.FEmpty, 1);
        }
      }else{
        if (langID == CLIRUtils.E) {
          reporter.incrCounter(Docs.E, 1);
        }else {
          reporter.incrCounter(Docs.F, 1);
        }
      }  

      for (int i = 0; i < sentences.size(); i++) {
        if (langID == CLIRUtils.E) {
          if (helper.getEOOVRate(sentences.get(i).toString()) > maxOOVRate ) {
            reporter.incrCounter(Sentences.OOV, 1);
            continue;
          }         
          reporter.incrCounter(Sentences.ELength, sentLengths.get(i));
          reporter.incrCounter(Sentences.E, 1);   
        }else {
          if (helper.getFOOVRate(sentences.get(i).toString()) > maxOOVRate ) {
            reporter.incrCounter(Sentences.OOV, 1);
            continue;
          }
          reporter.incrCounter(Sentences.FLength, sentLengths.get(i));
          reporter.incrCounter(Sentences.F, 1);   
        }
        keyOut.set(docno, langID);     
        valOut.set(langID, sentences.get(i), vectors.get(i));
        output.collect(keyOut, valOut);
View Full Code Here

TOP

Related Classes of edu.umd.cloud9.io.array.ArrayListOfIntsWritable

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.