Examples of VocabularyWritable

edu.umd.hooka.VocabularyWritable

Examples of edu.umd.hooka.VocabularyWritable

    testOOV("zh", vocab, false, false, zhExpectedOOVRates);    
  }


  @Test
  public void testTurkishOOVs() {
    VocabularyWritable vocab = new VocabularyWritable();
    List<String> sentences = readInput(dir + "data/tokenizer/test/tr-test.tok.stemmed.stop");
    for (String token : sentences.get(3).split(" ")) {
      vocab.addOrGet(token);
    }
    vocab.addOrGet("ispanyol");
    vocab.addOrGet("isim");
    vocab.addOrGet("10");


    float[] trStopStemExpectedOOVRates = {0.85714287f, 1f, 0.6f, 0f};
    float[] trStopExpectedOOVRates = {1f, 1f, 0.8f, 0.5f};
    float[] trStemExpectedOOVRates = {0.85714287f, 1f, 0.71428573f, 0.33333334f};
    float[] trExpectedOOVRates = {1f, 1f, 0.85714287f, 0.6666667f};

View Full Code Here

Examples of edu.umd.hooka.VocabularyWritable

    testOOV("tr", vocab, false, false, trExpectedOOVRates);    
  }


  @Test
  public void testArabicOOVs() {
    VocabularyWritable vocab = new VocabularyWritable();
    List<String> sentences = readInput(dir + "data/tokenizer/test/ar-test.tok.stemmed.stop");
    for (String token : sentences.get(0).split(" ")) {
      vocab.addOrGet(token);
    }
    vocab.addOrGet("2011");
    float[] arStopStemExpectedOOVRates = {0f, 1f, 0.8181818f, 1f};
    float[] arStopExpectedOOVRates = {0.6666667f, 1f, 0.8181818f, 1f};
    float[] arStemExpectedOOVRates = {0f, 1f, 0.85714287f, 1f};
    float[] arExpectedOOVRates = {0.6666667f, 1f, 0.85714287f, 1f};

View Full Code Here

Examples of edu.umd.hooka.VocabularyWritable

    testOOV("ar", vocab, false, false, arExpectedOOVRates);
  }


  @Test
  public void testEnglishOOVs() {
    VocabularyWritable vocab = new VocabularyWritable();
    vocab.addOrGet("r.d.");
    vocab.addOrGet("craig");
    vocab.addOrGet("dictionari");
    vocab.addOrGet("polynesian");
    vocab.addOrGet("mytholog");
    vocab.addOrGet("greenwood");
    vocab.addOrGet("press");
    vocab.addOrGet("new");
    vocab.addOrGet("york");
    vocab.addOrGet("1989");
    vocab.addOrGet("24");
    vocab.addOrGet("26");
    vocab.addOrGet("english");
    vocab.addOrGet("tree");
    vocab.addOrGet("einbaum");


    float[] enStopStemExpectedOOVRates = {1f, 18/19f, 4/7.0f, 0f};
    float[] enStopExpectedOOVRates = {1f, 18/19f, 4/7.0f, 2/12f};
    float[] enStemExpectedOOVRates = {1f, 36/37f, 15/18.0f, 7/19f};
    float[] enExpectedOOVRates = {1f, 36/37f, 15/18.0f, 9/19f};

View Full Code Here

Examples of edu.umd.hooka.VocabularyWritable

    String stemmedStopwordsFile = conf.get(Constants.StemmedStopwordList);
    stemmedStopwords = readInput(fs, stemmedStopwordsFile);
    isStopwordRemoval = !stopwords.isEmpty();
    isStemming = conf.getBoolean(Constants.Stemming, true);


    VocabularyWritable vocab;
    try {
      vocab = (VocabularyWritable) HadoopAlign.loadVocab(new Path(conf.get(Constants.CollectionVocab)), fs);
      setVocab(vocab);
    } catch (Exception e) {
      LOG.warn("No vocabulary provided to tokenizer.");

View Full Code Here

Examples of edu.umd.hooka.VocabularyWritable

    }
  }


  @Test
  public void testChineseOOVs() {
    VocabularyWritable vocab = new VocabularyWritable();
    List<String> sentences = readInput(dir + "data/tokenizer/test/zh-test.tok.stemmed.stop");
    for (String token : sentences.get(3).split(" ")) {
      vocab.addOrGet(token);
    }
    vocab.addOrGet("1457");
    vocab.addOrGet("19");


    float[] zhExpectedOOVRates = {0.6666667f, 0.8666667f, 0.72727275f, 0f};     // all same since no stemming or stopword removal
 
    testOOV("zh", vocab, true, true, zhExpectedOOVRates);
    testOOV("zh", vocab, false, true, zhExpectedOOVRates);

View Full Code Here

Examples of edu.umd.hooka.VocabularyWritable

    testOOV("zh", vocab, false, false, zhExpectedOOVRates);    
  }


  @Test
  public void testTurkishOOVs() {
    VocabularyWritable vocab = new VocabularyWritable();
    List<String> sentences = readInput(dir + "data/tokenizer/test/tr-test.tok.stemmed.stop");
    for (String token : sentences.get(3).split(" ")) {
      vocab.addOrGet(token);
    }
    vocab.addOrGet("ispanyol");
    vocab.addOrGet("isim");
    vocab.addOrGet("10");


    float[] trStopStemExpectedOOVRates = {0.85714287f, 1f, 0.6f, 0f};
    float[] trStopExpectedOOVRates = {1f, 1f, 0.8f, 0.5f};
    float[] trStemExpectedOOVRates = {0.85714287f, 1f, 0.71428573f, 0.33333334f};
    float[] trExpectedOOVRates = {1f, 1f, 0.85714287f, 0.6666667f};

View Full Code Here

Examples of edu.umd.hooka.VocabularyWritable

    testOOV("tr", vocab, false, false, trExpectedOOVRates);    
  }


  @Test
  public void testArabicOOVs() {
    VocabularyWritable vocab = new VocabularyWritable();
    List<String> sentences = readInput(dir + "data/tokenizer/test/ar-test.tok.stemmed.stop");
    for (String token : sentences.get(0).split(" ")) {
      vocab.addOrGet(token);
    }
    vocab.addOrGet("2011");
    float[] arStopStemExpectedOOVRates = {0f, 1f, 0.8181818f, 1f};
    float[] arStopExpectedOOVRates = {0.6666667f, 1f, 0.8181818f, 1f};
    float[] arStemExpectedOOVRates = {0f, 1f, 0.85714287f, 1f};
    float[] arExpectedOOVRates = {0.6666667f, 1f, 0.85714287f, 1f};

View Full Code Here

Examples of edu.umd.hooka.VocabularyWritable

    testOOV("ar", vocab, false, false, arExpectedOOVRates);
  }


  @Test
  public void testEnglishOOVs() {
    VocabularyWritable vocab = new VocabularyWritable();
    vocab.addOrGet("r.d.");
    vocab.addOrGet("craig");
    vocab.addOrGet("dictionari");
    vocab.addOrGet("polynesian");
    vocab.addOrGet("mytholog");
    vocab.addOrGet("greenwood");
    vocab.addOrGet("press");
    vocab.addOrGet("new");
    vocab.addOrGet("york");
    vocab.addOrGet("1989");
    vocab.addOrGet("24");
    vocab.addOrGet("26");
    vocab.addOrGet("english");
    vocab.addOrGet("tree");
    vocab.addOrGet("einbaum");


    float[] enStopStemExpectedOOVRates = {1f, 18/19f, 4/7.0f, 0f};
    float[] enStopExpectedOOVRates = {1f, 18/19f, 4/7.0f, 2/12f};
    float[] enStemExpectedOOVRates = {1f, 36/37f, 15/18.0f, 7/19f};
    float[] enExpectedOOVRates = {1f, 36/37f, 15/18.0f, 9/19f};

View Full Code Here

Examples of edu.umd.hooka.VocabularyWritable

    stopwords = readInput(fs, stopwordsFile);      
    String stemmedStopwordsFile = conf.get(Constants.StemmedStopwordList);
    stemmedStopwords = readInput(fs, stemmedStopwordsFile);
    isStopwordRemoval = !stopwords.isEmpty();
    
    VocabularyWritable vocab;
    try {
      vocab = (VocabularyWritable) HadoopAlign.loadVocab(new Path(conf.get(Constants.CollectionVocab)), fs);
      setVocab(vocab);
    } catch (Exception e) {
      LOG.warn("No vocabulary provided to tokenizer.");

View Full Code Here

Examples of edu.umd.hooka.VocabularyWritable

    String stopwordsFile = conf.get(Constants.StopwordList);
    stopwords = readInput(fs, stopwordsFile);      
    String stemmedStopwordsFile = conf.get(Constants.StemmedStopwordList);
    stemmedStopwords = readInput(fs, stemmedStopwordsFile);


    VocabularyWritable vocab;
    try {
      vocab = (VocabularyWritable) HadoopAlign.loadVocab(new Path(conf.get(Constants.CollectionVocab)), fs);
      setVocab(vocab);
    } catch (Exception e) {
      sLogger.warn("No vocabulary provided to tokenizer.");

View Full Code Here

0 1 2 3 4

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.