Package: uk.ac.cam.ch.wwmm.ptc.experimental.ngramtfdf

Examples of uk.ac.cam.ch.wwmm.ptc.experimental.ngramtfdf.NGramTfDf


  public static void tfIdfAnalyseCluster(Map<Integer,Float> cluster, IndexReader ir) throws Exception {
    List<File> clusterFiles = new ArrayList<File>();
    for(Integer i : cluster.keySet()) {
      clusterFiles.add(new File(ir.document(i).getField("filename").stringValue().replaceAll("markedup", "source")));
    }
    NGramTfDf ngtd = NGramTfDf.analyseFiles(clusterFiles);
    ngtd.calculateNGrams();
    Bag<String> tf = ngtd.getDfBag(1);
    tf.discardInfrequent(2);
    Map<String,Double> tfIdf = new HashMap<String,Double>();
    int numDocs = ir.numDocs();
    IndexSearcher is = new IndexSearcher(ir);
    for(String s : tf.getSet()) {
View Full Code Here


            onts.add(termArray[j]);
          }       
        }       
      }
    }
    NGramTfDf ngtd = NGramTfDf.analyseFiles(clusterFiles);
    ngtd.calculateNGrams();
    Bag<String> df = ngtd.getDfBag(1);
    df.discardInfrequent(2);
    Map<String,Double> scores = new HashMap<String,Double>();
    int numDocs = ir.numDocs();
    int clusterSize = cluster.size();
    double scaleFactor = clusterSize * 1.0 / numDocs;
    IndexSearcher is = new IndexSearcher(ir);
    for(String s : df.getSet()) {
      //System.out.println(s);
      int docFreq = 0;
      Query q;
      if(s.matches("\\S+")) {
        TermQuery tq = new TermQuery(new Term("txt", s));
        q = tq;
        //docFreq = ir.docFreq(new Term("txt", s));
      } else {
        PhraseQuery pq = new PhraseQuery();
        for(String ss : StringTools.arrayToList(s.split("\\s+"))) pq.add(new Term("txt", ss));
        q = pq;
      }
      VectorCollector vc = new VectorCollector();
      is.search(q, vc);
      docFreq = vc.getResultsVector().size();
      double score;
      double expected = scaleFactor * docFreq;
      double excess = df.getCount(s) - expected;
      score = excess / clusterSize;       
      if(score > threshold) scores.put(s, score);
    }
    Stemmer st = new Stemmer(new EnglishStemmer());
    Map<String,List<String>> stems = st.wordsToStems(df.getSet());
    for(String stem : stems.keySet()) {
      List<String> words = stems.get(stem);
      if(words.size() > 1) {
        BooleanQuery bq = new BooleanQuery(true);
        for(String word : words) {
          bq.add(new BooleanClause(new TermQuery(new Term("txt", word)), Occur.SHOULD));
        }
        VectorCollector vc = new VectorCollector();
        is.search(bq, vc);
        double expected = scaleFactor * vc.getResultsVector().size();
        int overlap = overlapDocs(vc.getResultsVector(), cluster);
        double excess = overlap - expected;
        double score = excess / clusterSize;
        if(score > threshold) {
          df.add(stems.get(stem).toString(), overlap);
          scores.put(stems.get(stem).toString(), score);
        }
      }
    }
    Map<String,List<String>> termStems = ngtd.ngramsByStem();
    for(String stem : termStems.keySet()) {
      List<String> multiWords = termStems.get(stem);
      if(multiWords.size() > 1) {
        BooleanQuery bq = new BooleanQuery(true);
        for(String multiWord : multiWords) {
View Full Code Here

    }
    Stemmer st = new Stemmer(new EnglishStemmer());
    Map<String,List<String>> stems = st.wordsToStems(dfs.getSet());

    dfs.discardInfrequent(2);
    NGramTfDf ngtd = NGramTfDf.analyseFiles(clusterFiles);
    ngtd.calculateNGrams();
    Bag<String> bs = ngtd.getDfBag(2);
    bs.discardInfrequent(2);
    Map<String,List<String>> termStems = ngtd.ngramsByStem();

    Map<String,Double> scores = new HashMap<String,Double>();
    Map<String,Integer> overlaps = new HashMap<String,Integer>();
    IndexSearcher is = new IndexSearcher(ir);
    int docTotal = ir.numDocs();
View Full Code Here

TOP

Related Classes of uk.ac.cam.ch.wwmm.ptc.experimental.ngramtfdf.NGramTfDf

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact: coftware#gmail.com (replace "#" with "@").