Package edu.umd.cloud9.collection.wikipedia

Examples of edu.umd.cloud9.collection.wikipedia.WikipediaPage


    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);

    SequenceFile.Reader reader;
    IntWritable key = new IntWritable();
    HMapSFW value = new HMapSFW();

    // Open the SequenceFile of (IntWritable, HMapSFW) records and check the
    // first term-doc vector against the expected fixture.
    reader = new SequenceFile.Reader(fs.getConf(),
        SequenceFile.Reader.file(new Path(enwikiEn + "/test_wt-term-doc-vectors/part-00000")));
    reader.next(key, value);
    verifyTermDocVector(enTermDocVector1, value);


    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);

    SequenceFile.Reader reader;
    IntWritable key = new IntWritable();
    HMapSFW value = new HMapSFW();

    reader = new SequenceFile.Reader(fs.getConf(),
        SequenceFile.Reader.file(new Path(dewikiEn + "/test_wt-term-doc-vectors/part-00000")));
    reader.next(key, value);
    verifyTermDocVector(deTermDocVector1, value);
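
The two snippets above read and verify only the first record of the part file. As a minimal sketch (not taken from the test suite), the same (IntWritable, HMapSFW) pairs can be scanned end to end as shown below; the HMapSFW import path assumes Cloud9's io.map package, and the part-file path is passed in as an argument rather than built from enwikiEn or dewikiEn:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.SequenceFile;
    import edu.umd.cloud9.io.map.HMapSFW;

    public class TermDocVectorScan {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]);  // e.g. <index>/test_wt-term-doc-vectors/part-00000

        IntWritable key = new IntWritable();
        HMapSFW value = new HMapSFW();

        SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path));
        try {
          // next() deserializes the current record into key/value and advances the reader;
          // it returns false once the end of the part file is reached.
          while (reader.next(key, value)) {
            System.out.println(key.get() + " -> " + value.size() + " weighted terms");
          }
        } finally {
          reader.close();
        }
      }
    }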

    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);

    SequenceFile.Reader reader;
    IntWritable key = new IntWritable();
    HMapSFW value = new HMapSFW();

    reader = new SequenceFile.Reader(fs.getConf(),
        SequenceFile.Reader.file(new Path(enwikiEn + "/test_wt-term-doc-vectors/part-00000")));
    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    for (MapKF.Entry<String> entry : value.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyTermDocVector(enTermDocVector1, value);

    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    for (MapKF.Entry<String> entry : value.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyTermDocVector(enTermDocVector2, value);
    reader.close();
  }

    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);

    SequenceFile.Reader reader;
    IntWritable key = new IntWritable();
    HMapSFW value = new HMapSFW();

    reader = new SequenceFile.Reader(fs.getConf(),
        SequenceFile.Reader.file(new Path(dewikiEn + "/test_wt-term-doc-vectors/part-00000")));
    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    for (MapKF.Entry<String> entry : value.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyTermDocVector(deTermDocVector1, value);

    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    for (MapKF.Entry<String> entry : value.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyTermDocVector(deTermDocVector2, value);
    reader.close();
  }
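
The top-10 printing loop above is repeated verbatim in each test body. A small helper with the same behavior is sketched below, written against the MapKF.Entry accessors used above; the helper itself is not part of Cloud9:

    import edu.umd.cloud9.io.map.HMapSFW;
    import edu.umd.cloud9.util.map.MapKF;

    public final class TermDocVectors {
      private TermDocVectors() {}

      // Print the k highest-weighted terms of a doc vector, one "term: weight" pair per line.
      public static void printTopTerms(HMapSFW vector, int k) {
        System.out.println("*** top " + k + " terms ***");
        MapKF.Entry<String>[] top = vector.getEntriesSortedByValue(k);
        if (top == null) {
          return;  // nothing to print for an empty vector
        }
        for (MapKF.Entry<String> entry : top) {
          System.out.println(entry.getKey() + ": " + entry.getValue());
        }
      }
    }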

    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);

    SequenceFile.Reader reader;
    IntWritable key = new IntWritable();
    HMapSFW value = new HMapSFW();

    reader = new SequenceFile.Reader(fs.getConf(),
        SequenceFile.Reader.file(new Path(galagoIndex + "/test_wt-term-doc-vectors/part-00000")));

    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    for (MapKF.Entry<String> entry : value.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyTermDocVector(galagoTermDocVector1, value);

    reader.next(key, value);

    System.out.println("*** top 10 terms ***");
    for (MapKF.Entry<String> entry : value.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }

    verifyTermDocVector(galagoTermDocVector2, value);
    reader.close();

    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);

    SequenceFile.Reader reader;
    IntWritable key = new IntWritable();
    HMapSFW value = new HMapSFW();

    reader = new SequenceFile.Reader(fs.getConf(),
        SequenceFile.Reader.file(new Path(opennlpIndex + "/test_wt-term-doc-vectors/part-00000")));

    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    for (MapKF.Entry<String> entry : value.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyTermDocVector(opennlpTermDocVector1, value);

    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    for (MapKF.Entry<String> entry : value.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyTermDocVector(opennlpTermDocVector2, value);
    reader.close();
  }
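
In the tests above, reader.close() only runs if every verification before it passes. Since SequenceFile.Reader is Closeable, a try-with-resources form releases the handle even when an assertion fails; the generic reader below is a sketch along those lines, not code from the test class:

    import java.io.IOException;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.SequenceFile;
    import edu.umd.cloud9.io.map.HMapSFW;

    public final class TermDocVectorReads {
      private TermDocVectorReads() {}

      // Return the term-doc vector stored at 0-based position n in a part file.
      public static HMapSFW readNthVector(Configuration conf, Path part, int n) throws IOException {
        IntWritable key = new IntWritable();
        HMapSFW value = new HMapSFW();
        try (SequenceFile.Reader reader =
            new SequenceFile.Reader(conf, SequenceFile.Reader.file(part))) {
          for (int i = 0; i <= n; i++) {
            if (!reader.next(key, value)) {
              throw new IOException("part file has fewer than " + (n + 1) + " records");
            }
          }
        }
        return value;
      }
    }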

      sLogger = logger;
    }

    //sLogger.setLevel(Level.DEBUG);

    HMapSFW v = new HMapSFW();
    float normalization=0;
    for(int e : tfTable.keySet()){
      // retrieve term string, tf and df
      String eTerm = eVocab.get(e);
      float tf = tfTable.get(e);
      float df = dfTable.get(e);

      // compute score via scoring model
      float score = ((Bm25) scoringModel).computeDocumentWeight(tf, df, docLen);

      sLogger.debug(eTerm+" "+tf+" "+df+" "+score);
      if(score>0){
        v.put(eTerm, score);
        if(isNormalize){
          normalization+=Math.pow(score, 2);
        }   
      }
    }

    // length-normalize doc vector
    if(isNormalize){
      normalization = (float) Math.sqrt(normalization);
      for(Entry<String> e : v.entrySet()){
        v.put(e.getKey(), e.getValue()/normalization);
      }
    }
    return v;
  }
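
The weighting itself is delegated to ((Bm25) scoringModel).computeDocumentWeight(tf, df, docLen), whose implementation is not shown here. Purely as an illustration of the kind of score such a call produces, a self-contained Okapi BM25 term weight could look like the sketch below; k1, b, and the corpus statistics are assumed placeholder values, not Ivory's defaults:

    // Illustrative BM25 document-side term weight; NOT the Bm25 class used above.
    public final class Bm25Sketch {
      private static final float K1 = 1.2f;           // term-frequency saturation (assumed)
      private static final float B = 0.75f;           // length-normalization strength (assumed)
      private static final float AVG_DOC_LEN = 500f;  // average document length (assumed)
      private static final long NUM_DOCS = 1000000L;  // collection size (assumed)

      private Bm25Sketch() {}

      public static float termWeight(float tf, float df, int docLen) {
        // Robertson/Sparck Jones style idf, clamped at zero for very common terms.
        float idf = (float) Math.log((NUM_DOCS - df + 0.5) / (df + 0.5));
        if (idf < 0) {
          idf = 0;
        }
        float lengthNorm = K1 * ((1 - B) + B * docLen / AVG_DOC_LEN);
        return idf * tf * (K1 + 1) / (tf + lengthNorm);
      }
    }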

      sLogger = logger;
    }

    //sLogger.setLevel(Level.DEBUG);

    HMapSFW v = new HMapSFW();
    float normalization=0;
    for(int e : tfTable.keySet()){
      // retrieve term string, tf and df
      String eTerm = eVocab.get(e);
      float tf = tfTable.get(e);
      float df = dfTable.get(eTerm);

      // compute score via scoring model
      float score = ((Bm25) scoringModel).computeDocumentWeight(tf, df, docLen);

      sLogger.debug(eTerm+" "+tf+" "+df+" "+score);
      if(score>0){
        v.put(eTerm, score);
        if(isNormalize){
          normalization+=Math.pow(score, 2);
        }  
      }
    }

    // length-normalize doc vector
    if(isNormalize){
      normalization = (float) Math.sqrt(normalization);
      for(Entry<String> e : v.entrySet()){
        v.put(e.getKey(), e.getValue()/normalization);
      }
    }
    return v;
  }
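
The block at the end of each snippet divides every weight by the vector's L2 norm. The same step as a standalone helper over an HMapSFW, using the Cloud9 map calls that appear above, is sketched here:

    import edu.umd.cloud9.io.map.HMapSFW;
    import edu.umd.cloud9.util.map.MapKF;

    public final class VectorNorm {
      private VectorNorm() {}

      // Scale the vector in place so its L2 norm is 1; an all-zero vector is left untouched.
      public static void normalizeL2(HMapSFW v) {
        float sumOfSquares = 0f;
        for (MapKF.Entry<String> e : v.entrySet()) {
          sumOfSquares += e.getValue() * e.getValue();
        }
        if (sumOfSquares == 0f) {
          return;
        }
        float norm = (float) Math.sqrt(sumOfSquares);
        for (MapKF.Entry<String> e : v.entrySet()) {
          v.put(e.getKey(), e.getValue() / norm);
        }
      }
    }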

      sLogger = logger;
    }

    //    sLogger.setLevel(Level.DEBUG);

    HMapSFW v = new HMapSFW();
    float normalization=0;
    for(edu.umd.cloud9.util.map.MapIF.Entry entry : tfTable.entrySet()){
      // retrieve term string, tf and df
      String eTerm = eVocab.get(entry.getKey());
      float tf = entry.getValue();
      int eId = dict.getId(eTerm);
      if(eId < 1){    //OOV
        continue;
      }
      int df = dfTable.getDf(eId);
      // compute score via scoring model
      float score = ((Bm25) scoringModel).computeDocumentWeight(tf, df, docLen);
      if(df<1){
        sLogger.warn("Suspicious DF WARNING = "+eTerm+" "+tf+" "+df+" "+score);
      }

      sLogger.debug(eTerm+" "+tf+" "+df+" "+score);

      if(score>0){
        v.put(eTerm, score);
        if(isNormalize){
          normalization+=Math.pow(score, 2);
        }   
      }
    }

    // length-normalize doc vector
    if(isNormalize){
      normalization = (float) Math.sqrt(normalization);
      for(Entry<String> e : v.entrySet()){
        v.put(e.getKey(), e.getValue()/normalization);
      }
    }
    return v;
  }
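
This variant first maps each term to an id and skips out-of-vocabulary terms (dict.getId returning a value below 1). The sketch below isolates that lookup against hypothetical Dictionary and DfTable interfaces that only mirror the calls used above; the real Ivory types may differ:

    // Hypothetical interfaces standing in for the dict and dfTable objects above.
    interface Dictionary {
      int getId(String term);   // a value below 1 means the term is out of vocabulary
    }

    interface DfTable {
      int getDf(int termId);    // document frequency for an in-vocabulary term id
    }

    final class DfLookup {
      private DfLookup() {}

      // Return the term's df, or -1 if the term is out of vocabulary and should be skipped.
      static int dfOrSkip(Dictionary dict, DfTable dfTable, String term) {
        int id = dict.getId(term);
        if (id < 1) {
          return -1;  // OOV: the caller drops the term, as in the loops above
        }
        return dfTable.getDf(id);
      }
    }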

      sLogger = logger;
    }

    //    sLogger.setLevel(Level.DEBUG);

    HMapSFW v = new HMapSFW();
    float normalization=0;
    for(edu.umd.cloud9.util.map.MapKI.Entry<String> entry : tfTable.entrySet()){
      // retrieve term string, tf and df
      String eTerm = entry.getKey();
      float tf = entry.getValue();
      int eId = dict.getId(eTerm);
      if(eId < 1){    //OOV
        continue;
      }
      int df = dfTable.getDf(eId);
      // compute score via scoring model
      float score = ((Bm25) scoringModel).computeDocumentWeight(tf, df, docLen);
      if(df<1){
        sLogger.warn("Suspicious DF WARNING = "+eTerm+" "+tf+" "+df+" "+score);
      }

      sLogger.debug(eTerm+" "+tf+" "+df+" "+score);

      if(score>0){
        v.put(eTerm, score);
        if(isNormalize){
          normalization+=Math.pow(score, 2);
        }  
      }
    }

    // length-normalize doc vector
    if(isNormalize){
      normalization = (float) Math.sqrt(normalization);
      for(Entry<String> e : v.entrySet()){
        v.put(e.getKey(), e.getValue()/normalization);
      }
    }
    return v;
  }
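
The vectors built this way are the same (IntWritable, HMapSFW) records that the earlier tests read back from test_wt-term-doc-vectors. A minimal sketch of writing one such record with Hadoop's SequenceFile writer options follows; the output path and the docno are placeholders:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.SequenceFile;
    import edu.umd.cloud9.io.map.HMapSFW;

    public class WriteDocVector {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path out = new Path(args[0]);  // e.g. .../term-doc-vectors/part-00000

        HMapSFW vector = new HMapSFW();
        vector.put("wikipedia", 2.3f);  // toy weights, for illustration only
        vector.put("page", 1.1f);

        try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
            SequenceFile.Writer.file(out),
            SequenceFile.Writer.keyClass(IntWritable.class),
            SequenceFile.Writer.valueClass(HMapSFW.class))) {
          writer.append(new IntWritable(1), vector);  // docno 1 -> its weighted term vector
        }
      }
    }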
