Package: org.apache.lucene.index

Examples of org.apache.lucene.index.TermFreqVector


  public static Map<String,Double> simpleExcessAnalyseCluster(Map<Integer,Float> cluster, IndexReader ir, double threshold) throws Exception {
    List<File> clusterFiles = new ArrayList<File>();
    Bag<String> df = new Bag<String>();
    for(Integer i : cluster.keySet()) {
      clusterFiles.add(new File(ir.document(i).getField("filename").stringValue().replaceAll("markedup", "source")));
      TermFreqVector tvf = ir.getTermFreqVector(i, "txt");
      if(tvf != null) {
        String [] termArray = tvf.getTerms();
        for(int j=0;j<termArray.length;j++) {
          String term = termArray[j];
          if(!TermSets.getClosedClass().contains(term) && term.matches(".*[A-Za-z].*")) df.add(term);
        }       
      }       
View Full Code Here


    Bag<String> dfs = new Bag<String>();
    Set<String> inchis = new HashSet<String>();
    Set<String> onts = new HashSet<String>();
    for(Integer i : cluster.keySet()) {
      cluster.put(i, 1.0f);
      TermFreqVector tvf = ir.getTermFreqVector(i, "txt");
      String [] termArray = tvf.getTerms();
      for(int j=0;j<termArray.length;j++) {
        dfs.add(termArray[j]);
      }
      if(false) {
        tvf = ir.getTermFreqVector(i, "InChI");
        if(tvf != null) {
          termArray = tvf.getTerms();
          for(int j=0;j<termArray.length;j++) {
            inchis.add(termArray[j]);
          }       
        }
        tvf = ir.getTermFreqVector(i, "Ontology");
        if(tvf != null) {
          termArray = tvf.getTerms();
          for(int j=0;j<termArray.length;j++) {
            onts.add(termArray[j]);
          }       
        }       
      }
View Full Code Here

    System.out.println("Terms: " + dfl.size());
   
    SVDHarness svdh = new SVDHarness(dfl.size(), numDocs);
   
    for(int i=0;i<ir.numDocs();i++) {
      TermFreqVector tv = ir.getTermFreqVector(i, "txt");
      String [] terms = tv.getTerms();
      int [] counts = tv.getTermFrequencies();
      double termTotal = 0.0;
      for(int j=0;j<tv.size();j++) {
        termTotal += counts[j];
      }
      for(int j=0;j<tv.size();j++) {
        if(termIndex.containsKey(terms[j])) {
          svdh.set(i, termIndex.get(terms[j]), counts[j] * (1.0 / termTotal) * Math.log(numDocs * 1.0 / docFreqs.get(terms[j])));
        }
      }
    }
View Full Code Here

  /**
   * Builds a {@code SimilarityMatrix} over the set of distinct InChI
   * identifiers found in the index.
   *
   * Scans every document's "InChI" term vector (documents without one are
   * skipped) and collects the distinct terms into a set.
   *
   * NOTE(review): this closes the IndexReader obtained from the shared
   * IndexSearcher ({@code lis.getIndexSearcher()}) — assumes no other code
   * still needs that reader; confirm against callers.
   *
   * @return a SimilarityMatrix covering every distinct InChI in the index
   * @throws Exception if the index cannot be read
   */
  private SimilarityMatrix makeSimilarityMatrix() throws Exception {
    Set<String> inchis = new HashSet<String>();
    IndexSearcher is = lis.getIndexSearcher();
    IndexReader ir = is.getIndexReader();
    // Iterate document ids 0..numDocs-1; a null term vector means the
    // document has no stored "InChI" vector.
    for(int i=0;i<ir.numDocs();i++) {
      TermFreqVector tfv = ir.getTermFreqVector(i, "InChI");
      if(tfv != null) {
        String [] terms = tfv.getTerms();
        for(int j=0;j<terms.length;j++) inchis.add(terms[j]);
      }
    }
    ir.close();
    return new SimilarityMatrix(inchis);
View Full Code Here

    //return coWeightSum / (Math.sqrt(weightSumSquare1) * Math.sqrt(weightSumSquare2));
  }

  private static Map<String,Double> getTfIdfVector(IndexReader ir, int doc, String field) throws Exception {
    Map<String,Double> tfIdf = new HashMap<String,Double>();
    TermFreqVector tv = ir.getTermFreqVector(doc, field);
    double termTotal = 0.0;
    if(tv == null) return tfIdf;
    String [] terms = tv.getTerms();
    int [] counts = tv.getTermFrequencies();
    double docTotal = ir.numDocs();
    for(int i=0;i<tv.size();i++) {
      termTotal += counts[i];
    }
    for(int i=0;i<tv.size();i++) {
      String term = terms[i].intern();
      double tf = counts[i] / termTotal;
      double idf = Math.log(docTotal / ir.docFreq(new Term(field, term)));
      tfIdf.put(term, tf * idf);
      //tfIdf.put(term, tf);
View Full Code Here

    }
  }
 
  private static Map<String,Double> getTfIdfVector(IndexReader ir, int doc, String field) throws Exception {
    Map<String,Double> tfIdf = new HashMap<String,Double>();
    TermFreqVector tv = ir.getTermFreqVector(doc, field);
    double termTotal = 0.0;
    if(tv == null) return tfIdf;
    String [] terms = tv.getTerms();
    int [] counts = tv.getTermFrequencies();
    double docTotal = ir.numDocs();
    for(int i=0;i<tv.size();i++) {
      termTotal += counts[i];
    }
    for(int i=0;i<tv.size();i++) {
      String term = terms[i].intern();
      double tf = counts[i] / termTotal;
      //double idf = 1.0;
      double idf = Math.log(docTotal / ir.docFreq(new Term(field, term)));
      //double idf = Math.log(docTotal / getDocFreq(ir, field, term));
View Full Code Here

      f.add( "docFreq", reader.docFreq( t ) ); // this can be 0 for non-indexed fields
           
      // If we have a term vector, return that
      if( fieldable.isTermVectorStored() ) {
        try {
          TermFreqVector v = reader.getTermFreqVector( docId, fieldable.name() );
          if( v != null ) {
            SimpleOrderedMap<Integer> tfv = new SimpleOrderedMap<Integer>();
            for( int i=0; i<v.size(); i++ ) {
              tfv.add( v.getTerms()[i], v.getTermFrequencies()[i] );
            }
            f.add( "termVector", tfv );
          }
        }
        catch( Exception ex ) {
View Full Code Here

    return doc;
  }
 
  private List<TermVectorOffsetInfo> getOffsets(IndexReader ir, int docNo, String fieldName, Collection<String> words) throws Exception {
    List<TermVectorOffsetInfo> results = new ArrayList<TermVectorOffsetInfo>();
    TermFreqVector tfv = ir.getTermFreqVector(docNo, fieldName);
    if(tfv instanceof TermPositionVector) {
      TermPositionVector tpv = (TermPositionVector)tfv;
      String [] terms = tpv.getTerms();
      for(int i=0;i<tpv.getTerms().length;i++) {
        if(words.contains(terms[i])) {
View Full Code Here

  public static Event docToEvent(IndexReader ir, int doc, String cue) throws Exception {
    Stemmer st = new Stemmer(new EnglishStemmer());
    List<String> words = new ArrayList<String>();
    boolean hasCue = false;
    TermFreqVector tvf = ir.getTermFreqVector(doc, "txt");
    String [] termArray = tvf.getTerms();
    int [] termFreqs = tvf.getTermFrequencies();
    for(int j=0;j<termArray.length;j++) {
      if(TermSets.getClosedClass().contains(termArray[j])) {
        //ignore
      } else if(termArray[j].equals(cue)) {
        hasCue = true;
View Full Code Here

        if(t.field().equals(fieldName)) ptt.add(t.text());
      }
    }
   
    List<TermVectorOffsetInfo> results = new ArrayList<TermVectorOffsetInfo>();
    TermFreqVector tfv = ir.getTermFreqVector(docNo, fieldName);
    Map<String,Integer> termToID = new HashMap<String,Integer>();
    if(tfv instanceof TermPositionVector) {
      TermPositionVector tpv = (TermPositionVector)tfv;
      String [] terms = tpv.getTerms();
      for(int i=0;i<tpv.getTerms().length;i++) {
View Full Code Here

TOP

Related Classes of org.apache.lucene.index.TermFreqVector

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle Inc. Contact: coftware@gmail.com.