Package org.apache.lucene.index

Examples of org.apache.lucene.index.TermFreqVector


     */
    public PriorityQueue<Object[]> retrieveTerms(int docNum) throws IOException {
        Map<String,Int> termFreqMap = new HashMap<String,Int>();
        for (int i = 0; i < fieldNames.length; i++) {
            String fieldName = fieldNames[i];
            TermFreqVector vector = ir.getTermFreqVector(docNum, fieldName);

            // field does not store term vector info
            if (vector == null) {
              Document d=ir.document(docNum);
              String text[]=d.getValues(fieldName);
View Full Code Here


                        }
                    }
                }
                separator = " ";
            }
            TermFreqVector tfv = reader.getTermFreqVector(
                    docNumber, FieldNames.FULLTEXT);
            if (tfv instanceof TermPositionVector) {
                return createExcerpt((TermPositionVector) tfv, text.toString(),
                        maxFragments, maxFragmentSize);
            } else {
View Full Code Here

    @Override
    public TermFreqVector getTermFreqVector(int docNum, String field) throws IOException
    {

        TermFreqVector termVector = new lucandra.TermFreqVector(getIndexName(), field, docNum);

        return termVector;
    }
View Full Code Here

      f.add( "docFreq", t.text()==null ? 0 : reader.docFreq( t ) ); // this can be 0 for non-indexed fields
           
      // If we have a term vector, return that
      if( fieldable.isTermVectorStored() ) {
        try {
          TermFreqVector v = reader.getTermFreqVector( docId, fieldable.name() );
          if( v != null ) {
            SimpleOrderedMap<Integer> tfv = new SimpleOrderedMap<Integer>();
            for( int i=0; i<v.size(); i++ ) {
              tfv.add( v.getTerms()[i], v.getTermFrequencies()[i] );
            }
            f.add( "termVector", tfv );
          }
        }
        catch( Exception ex ) {
View Full Code Here

      f.add( "docFreq", reader.docFreq( t ) ); // this can be 0 for non-indexed fields
           
      // If we have a term vector, return that
      if( fieldable.isTermVectorStored() ) {
        try {
          TermFreqVector v = reader.getTermFreqVector( docId, fieldable.name() );
          if( v != null ) {
            SimpleOrderedMap<Integer> tfv = new SimpleOrderedMap<Integer>();
            for( int i=0; i<v.size(); i++ ) {
              tfv.add( v.getTerms()[i], v.getTermFrequencies()[i] );
            }
            f.add( "termVector", tfv );
          }
        }
        catch( Exception ex ) {
View Full Code Here

      is.search(tq, vc);
      float vcs = vc.getResultsVector().size();
      Map<Integer,Float> stf = new HashMap<Integer,Float>();
     
      for(Integer j : vc.getResultsVector().keySet()) {
        TermFreqVector tv = ir.getTermFreqVector(j, "txt");
        String [] terms = tv.getTerms();
        for(int k=0;k<tv.size();k++) {
          String term = terms[k];
          if(termMap.containsKey(term)) {
            int termId = termMap.get(term);
            if(!stf.containsKey(termId)) stf.put(termId, 0.0f);
            stf.put(termId, stf.get(termId) + 1.0f/vcs);           
View Full Code Here

    //return coWeightSum / (Math.sqrt(weightSumSquare1) * Math.sqrt(weightSumSquare2));
  }

  private static Map<String,Double> getTfIdfVector(IndexReader ir, int doc, String field) throws Exception {
    Map<String,Double> tfIdf = new HashMap<String,Double>();
    TermFreqVector tv = ir.getTermFreqVector(doc, field);
    double termTotal = 0.0;
    if(tv == null) return tfIdf;
    String [] terms = tv.getTerms();
    int [] counts = tv.getTermFrequencies();
    double docTotal = ir.numDocs();
    for(int i=0;i<tv.size();i++) {
      String term = terms[i];
      //if(!term.matches(".*[A-Za-z].*")) continue;
      termTotal += counts[i];
    }
    for(int i=0;i<tv.size();i++) {
      String term = terms[i].intern();
      //if(!term.matches(".*[A-Za-z].*")) continue;
      double tf = counts[i] / termTotal;
      double idf = Math.log(docTotal / ir.docFreq(new Term(field, term)));
      tfIdf.put(term, tf * idf);
 
View Full Code Here

      Hits h = is.search(getQuery());
      if(h.length() < 20) return null;
      IndexReader ir = is.getIndexReader();
      Bag<String> termBag = new Bag<String>();
      for(int i=0;i<h.length();i++) {
        TermFreqVector tv = ir.getTermFreqVector(h.id(i), "txt");
        String [] terms = tv.getTerms();
        for(int k=0;k<tv.size();k++) {
          String term = terms[k];
          if("In".equals(term)) continue;
          if(TermSets.getClosedClass().contains(term)) continue;
          if(!term.matches(".*[A-Za-z].*")) continue;
          if(!includeTerms.contains(term) && !excludeTerms.contains(term)) termBag.add(terms[k]);
View Full Code Here

    LuceneIndexerSearcher lis = new LuceneIndexerSearcher(false);
    IndexSearcher is = lis.getIndexSearcher();
    IndexReader ir = lis.getIndexReader();
    Bag<String> tfBag = new Bag<String>();
    for(int i=0;i<ir.maxDoc();i++) {
      TermFreqVector tv = ir.getTermFreqVector(i, "txt");
      String [] terms = tv.getTerms();
      int [] freqs = tv.getTermFrequencies();
      for(int k=0;k<tv.size();k++) {
        String term = terms[k];
        if("In".equals(term)) continue;
        if(TermSets.getClosedClass().contains(term)) continue;
        if(!term.matches(".*[A-Za-z].*")) continue;
        tfBag.add(term, freqs[k]);
      }
    }
    double overallEntropy = tfBag.entropy();
    double totalEntropy = overallEntropy * tfBag.totalCount();
    System.out.println(totalEntropy);
   
    List<String> termList = tfBag.getList().subList(0, 2000);
    for(String splitTerm : termList) {
      Query q = new TermQuery(new Term("txt", splitTerm));
      VectorCollector vc = new VectorCollector();
      is.search(q, vc);
      Bag<String> inBag = new Bag<String>();
      Bag<String> outBag = new Bag<String>();
      for(int i=0;i<ir.maxDoc();i++) {
        Bag<String> bag = inBag;
        if(!vc.getResultsVector().containsKey(i)) continue;
       
        //Bag<String> bag = outBag;
        //if(vc.getResultsVector().containsKey(i)) bag = inBag;
        TermFreqVector tv = ir.getTermFreqVector(i, "txt");
        String [] terms = tv.getTerms();
        int [] freqs = tv.getTermFrequencies();
        for(int k=0;k<tv.size();k++) {
          String term = terms[k];
          if("In".equals(term)) continue;
          if(TermSets.getClosedClass().contains(term)) continue;
          if(!term.matches(".*[A-Za-z].*")) continue;
          bag.add(term, freqs[k]);
View Full Code Here

   
    List<File> clusterFiles = new ArrayList<File>();
    for(Integer i : cluster.keySet()) {
      clusterFiles.add(new File(ir.document(i).getField("filename").stringValue().replaceAll("markedup", "source")));
      if(enriched) {
        TermFreqVector tvf = ir.getTermFreqVector(i, "InChI");
        if(tvf != null) {
          String [] termArray = tvf.getTerms();
          for(int j=0;j<termArray.length;j++) {
            inchis.add(termArray[j]);
          }       
        }
        tvf = ir.getTermFreqVector(i, "Ontology");
        if(tvf != null) {
          String [] termArray = tvf.getTerms();
          for(int j=0;j<termArray.length;j++) {
            onts.add(termArray[j]);
          }       
        }       
      }
View Full Code Here

TOP

Related Classes of org.apache.lucene.index.TermFreqVector

Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.