Package org.apache.lucene.index

Examples of org.apache.lucene.index.Term
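A minimal, self-contained sketch of the class itself before the longer examples (written against the Lucene 2.x-era API that the snippets below use): a Term is an immutable <field, text> pair, the atomic unit of indexing and search.

import org.apache.lucene.index.Term;
import org.apache.lucene.search.TermQuery;

public class TermBasics {
  public static void main(String[] args) {
    // a Term pairs a field name with a token value
    Term term = new Term("contents", "lucene");
    System.out.println(term.field()); // prints: contents
    System.out.println(term.text());  // prints: lucene
    // a TermQuery matches exactly the documents containing this term
    TermQuery query = new TermQuery(term);
    System.out.println(query);        // prints: contents:lucene
  }
}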


  public String[] suggest(String queryString, IndexReader idxReader, String field) {
    List<String> results = new ArrayList<String>(maxSuggestionSize);
    if (queryString != null && queryString.length() >= minQueryStringLength) {
      Term term = new Term("t", queryString);
      PrefixQuery prefixQuery = new PrefixQuery(term);
      IndexReader indexReader = null;
      IndexSearcher indexSearch = null;
      try {
        indexReader = searchSuggestIndexer.openSuggestIndexReader();
        indexSearch = new IndexSearcher(indexReader);
        Hits hits = indexSearch.search(prefixQuery);
        int maxNumCandidate = maxSuggestionSize;
        if (idxReader != null && field != null) {
          maxNumCandidate = maxSuggestionSize * 10;
        }
        PriorityQueue<SuggestWord> suggestQueue = new PriorityQueue<SuggestWord>(maxNumCandidate);
        for (int i = 0; i < hits.length() && i < maxNumCandidate; i++) {
          String sugWord = hits.doc(i).get("t");
          // check whether 'sugWord' matches at least one doc in the
          // source index (idxReader)
          if (idxReader != null && field != null) {
            int freq = idxReader.docFreq(new Term(field, sugWord));
            if (freq > 0) {
              suggestQueue.add(new SuggestWord(sugWord, freq));
            }
          } else {
            suggestQueue.add(new SuggestWord(sugWord, 0));
View Full Code Here
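For context, here is a self-contained version of the prefix-lookup pattern above. It is a sketch against the same Lucene 2.x-era API (RAMDirectory, Hits, and the snippet's "t" field), not the original project's code.

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.store.RAMDirectory;

public class PrefixSuggestSketch {
  public static void main(String[] args) throws Exception {
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
    for (String w : new String[] { "lucene", "lucid", "lunar" }) {
      Document doc = new Document();
      // store each candidate word untokenized in field "t", as the snippet expects
      doc.add(new Field("t", w, Field.Store.YES, Field.Index.UN_TOKENIZED));
      writer.addDocument(doc);
    }
    writer.close();

    IndexSearcher searcher = new IndexSearcher(dir);
    // PrefixQuery expands to every term in field "t" starting with "lu"
    Hits hits = searcher.search(new PrefixQuery(new Term("t", "lu")));
    for (int i = 0; i < hits.length(); i++) {
      System.out.println(hits.doc(i).get("t")); // lucene, lucid, lunar
    }
    searcher.close();
  }
}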


    Document leadDoc = null;
    try {
      boolean exists = IndexReader.indexExists(indexDirectory);
      if (exists) {
        reader = IndexReader.open(indexDirectory);
        Term key = new Term(DocumentCreator.FIELD_URL_MD5, urlHash);
        termDocs = reader.termDocs(key);
        boolean found = false;
        while (termDocs.next() && !found) {
          int pos = termDocs.doc();
          // use a FieldSelector for more efficient loading of fields
View Full Code Here
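The comment above points at FieldSelector; a minimal sketch of that idea (Lucene 2.x-era API; the "url" field name is illustrative, not taken from the original code):

import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.MapFieldSelector;
import org.apache.lucene.index.IndexReader;

public class LoadOneField {
  // load only the stored "url" field of document 'pos' instead of all fields
  public static String urlOf(IndexReader reader, int pos) throws IOException {
    Document doc = reader.document(pos, new MapFieldSelector(new String[] { "url" }));
    return doc.get("url");
  }
}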

    int length = maxDoc + 1;
    TermValueList<T> list = listFactory == null ? (TermValueList<T>) new TermStringList()
        : listFactory.createTermList();
    TermDocs termDocs = reader.termDocs();
    TermEnum termEnum = reader.terms(new Term(field, ""));
    int t = 0; // current term number

    // dummy entry at index 0 for documents that have no value in this field
    list.add(null);
    minIDList.add(-1);
    maxIDList.add(-1);
    freqList.add(0);
    t++;
    try
    {
      do
      {
        Term term = termEnum.term();
        if (term == null || !field.equals(term.field()))
          break;

        if (t > order.maxValue())
        {
          throw new IOException("maximum number of values cannot exceed: "
              + order.maxValue());
        }
        // store the term text; we expect at most one term per document
        if (t >= length)
          throw new RuntimeException("there are more terms than documents in field \""
              + field + "\", but it's impossible to sort on tokenized fields");
        list.add(term.text());
        termDocs.seek(termEnum);
        // termEnum.docFreq() is not used here: it doesn't account for deleted docs
        int minID = -1;
        int maxID = -1;
View Full Code Here
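The skeleton of the TermEnum/TermDocs walk used above, as a stand-alone sketch (Lucene 2.x-era API): enumerate every term of one field, and for each term visit the ids of the documents containing it.

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;

public class TermWalk {
  public static void walk(IndexReader reader, String field) throws IOException {
    TermDocs termDocs = reader.termDocs();
    // terms(t) returns an enumeration positioned at the first term >= t,
    // so seeding with (field, "") starts at the first term of that field
    TermEnum termEnum = reader.terms(new Term(field, ""));
    try {
      do {
        Term term = termEnum.term();
        if (term == null || !field.equals(term.field())) break; // ran past the field
        termDocs.seek(termEnum); // reuse one TermDocs across all terms
        while (termDocs.next()) {
          System.out.println(term.text() + " -> doc " + termDocs.doc());
        }
      } while (termEnum.next());
    } finally {
      termDocs.close();
      termEnum.close();
    }
  }
}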

   
    _overflow = false;
    try
    {
      tdoc = reader.termDocs();
      tenum = reader.terms(new Term(fieldName, ""));
      if (tenum != null)
      {
        do
        {
          Term term = tenum.term();
          if (term == null || !fieldName.equals(term.field()))
            break;

          String val = term.text();

          if (val != null)
          {
            list.add(val);
View Full Code Here

    
   if ( (etype != null) && (etype.length() > 2) )
    { String[] etypes = etype.split("OR");
      for (int j = 0; j < etypes.length; j++)
      { String et = etypes[j].trim();
        queryString.append("contents:" + et + "^" + nf.format(wt) + " ");
        TermQuery tq = new TermQuery( new Term("contents", et)); tq.setBoost(wt);
        theQuery.add(tq, BooleanClause.Occur.SHOULD);
        entities.add(et);
      }
      break LOOP;
    }
   }
  
  //*-------------------------------------------
  //*-- 2. Find entities in the question words
  //*-------------------------------------------
  wt = WT_ENTITY;
  for (int i = 0; i < tokenList.size(); i++)
  { if ( tokenList.get(i).type().equals("ENTITY") )  
    { String qword = tokenList.get(i).termText();
      queryString.append("contents:" + qword + "^" + nf.format(wt) + " ");
      TermQuery tq = new TermQuery( new Term("contents", qword)); tq.setBoost(wt);
      theQuery.add(tq, BooleanClause.Occur.SHOULD);
    }
  }
 
  //*-------------------------------------------------------------------------------
  //*-- 3. Create a list of weighted trigrams/bigrams/unigrams from the query
  //*-------------------------------------------------------------------------------
  int numNouns = nouns.size(); int numVerbs = verbs.size(); int numAdjectives = adjectives.size();
  String[] queryWords = question.split("\\s+"); int wordsLength = queryWords.length;
  boolean[] contentWord = new boolean[wordsLength];
  for (int i = 0; i < wordsLength; i++)
   { queryWords[i] = queryWords[i].toLowerCase(Constants.locale);
     contentWord[i] = false;
     for (int j = 0; j < nouns.size(); j++) if (queryWords[i].equalsIgnoreCase(nouns.get(j))) contentWord[i] = true;
     for (int j = 0; j < verbs.size(); j++) if (queryWords[i].equalsIgnoreCase(verbs.get(j))) contentWord[i] = true;
     for (int j = 0; j < adjectives.size(); j++) if (queryWords[i].equalsIgnoreCase(adjectives.get(j))) contentWord[i] = true;
   }
 
  String joinChar; 
  //*-- generate all possible bigrams with higher weights for bigrams that do not have stopwords
  float WT_NORM_BIGRAM = WT_BIGRAM;
  for (int i = 1; i < 4; i++) if (wordsLength > (Math.pow(2, (i + 1)))) WT_NORM_BIGRAM /= 2;
  LOOP2: for (int i = 1; i < wordsLength; i++)
  { 
   //*-- skip if the previous word was a question word
   //*-- if the previous word was a stop word use a underscore to build the bigram, otherwise use a space
   wt = 0;
   if ( !questionWords.contains(queryWords[i-1]) )
   {
     if (stopWords.contains(queryWords[i-1]) && stopWords.contains(queryWords[i])) continue LOOP2;
     joinChar = (stopWords.contains(queryWords[i-1]) || stopWords.contains(queryWords[i])) ? "_": " ";
     for (int j = i-1; j < i+1; j++) wt += (contentWord[j]) ? WT_NORM_BIGRAM: 0;
     String bigram = queryWords[i-1] + joinChar + queryWords[i];
     queryString.append("contents:\"" + bigram + "\"~0^" + wt + " ");
     PhraseQuery pq = new PhraseQuery(); pq.add( new Term("contents", bigram)); pq.setBoost(wt); pq.setSlop(0);
     theQuery.add(pq, BooleanClause.Occur.SHOULD);
     bigrams.add(bigram);
   }
  } //*-- end of for
 
  //*-- create unigrams from non-stop words and weigh unigrams near the start of the question
  //*-- higher than unigrams near the end of the question
  LOOP3: for (int i = 0; i < wordsLength; i++)
  { wt = WT_UNIGRAM;
 
    //*-- skip punctuation and very short words
    if ( (queryWords[i].length() < 2) || (!contentWord[i]) ) continue LOOP3;
   
    wt *=  ( (numNouns > 0) && (nouns.get(0).equalsIgnoreCase(queryWords[i])) ) ? 8:
           ( (numNouns > 1) && (nouns.get(1).equalsIgnoreCase(queryWords[i])) ) ? 4: 1;
    wt *=  ( (numVerbs > 0) && (verbs.get(0).equalsIgnoreCase(queryWords[i])) ) ? 4:
           ( (numVerbs > 1) && (verbs.get(1).equalsIgnoreCase(queryWords[i])) ) ? 2: 1;
    wt *=  ( (numAdjectives > 0) && (adjectives.get(0).equalsIgnoreCase(queryWords[i])) ) ? 4:
           ( (numAdjectives > 1) && (adjectives.get(1).equalsIgnoreCase(queryWords[i])) ) ? 2: 1;
  
   queryString.append("contents:" + queryWords[i] + "^" + nf.format(wt) + " ");
   TermQuery tq = new TermQuery( new Term("contents", queryWords[i])); tq.setBoost(wt);
   theQuery.add(tq, BooleanClause.Occur.SHOULD);
  } //*-- end of for

  //*--------------------------------------------------------------------------
  //*-- 4. Add the query transformation for the particular query type and add the synonyms
  //*--------------------------------------------------------------------------
/*  wt = WT_SYNONYMS;
  for (int j = 0; j < synonyms.length; j++)
  { queryString.append("contents:" + synonyms[j] + "^" + nf.format(wt) + " ");
    TermQuery tq = new TermQuery( new Term("contents", synonyms[j])); tq.setBoost(wt);
    theQuery.add(tq, BooleanClause.Occur.SHOULD);
  }
  */
  wt = WT_TRANSFORM;
  Matcher matcher = whatPattern.matcher(question);
  if ( (matcher.matches()) && (nouns.size() > 0) )
  {  String qTransform = "\"" + nouns.get(0) + "_is" + "\"";
     queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
     TermQuery tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
     theQuery.add(tq, BooleanClause.Occur.SHOULD);
     qTransform = "\"" + nouns.get(0) + "_was" + "\"";
     queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
     tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
     theQuery.add(tq, BooleanClause.Occur.SHOULD);
  }
 
  matcher = wherePattern.matcher(question);
  if ( (matcher.matches()) && (nouns.size() > 0) )
   {  String qTransform = "\"is_located\"";
     queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
     TermQuery tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
     theQuery.add(tq, BooleanClause.Occur.SHOULD);
     qTransform = "\"located_at\"";
     queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
     tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
     theQuery.add(tq, BooleanClause.Occur.SHOULD);
  }
 
//  String query = queryString.toString();
//System.out.println("query string " + query);
View Full Code Here
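One detail worth flagging in the bigram step above: the joined bigram is added to the PhraseQuery as a single Term, which only matches if the index really contains such joined tokens. With ordinary one-word tokens, a phrase query takes one Term per word; a minimal sketch:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.PhraseQuery;

public class BigramPhrase {
  public static PhraseQuery adjacentPair(String w1, String w2) {
    PhraseQuery pq = new PhraseQuery();
    pq.add(new Term("contents", w1)); // one Term per word
    pq.add(new Term("contents", w2));
    pq.setSlop(0); // the two words must be adjacent and in order
    return pq;
  }
}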

     String word = tokens[i].termText().toLowerCase(Constants.locale);
    
     //*-- skip punctuation and very short words
     if ( (word.length() < 2) || (stopWords.contains(word)) ) continue LOOP;
  
     TermQuery tq = new TermQuery(new Term("contents", word));
     tq.setBoost((float) 10.0);     //*-- give an average boost to the phrases
     fullQuery.add(tq, BooleanClause.Occur.SHOULD);
   }
 
  return (fullQuery);
View Full Code Here

     //*-- limit the number of tokens taken from the document to 100
     int numTokens = 0;
     LOOP: for (int i = 0; i < tokens.length; i++)
      { String token = tokens[i].termText().toLowerCase(Constants.locale);
        if ( (stopWords.contains(token)) || (token.length() < 3) ) continue;
        TermQuery tq = new TermQuery(new Term("contents", token));
        query.add(tq, BooleanClause.Occur.SHOULD);
        if (++numTokens >= 100) break LOOP;
      } //*-- end of for
   } //*-- end of if
 
View Full Code Here

  else
  { try
     { synchronized(this)
       { FSDirectory fsd = FSDirectory.getDirectory(new File(Constants.getINDEXDIR()), false);
         IndexReader ir = IndexReader.open(fsd);
         ir.deleteDocuments(new Term("key", iDocument));
         ir.close();
       }
     }
     catch (IOException ie) { logger.error("Failed to delete " + iDocument + " " + ie.getMessage()); }
  }
View Full Code Here
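Deleting by Term also works through IndexWriter, the more common pattern in later 2.x code; a sketch reusing the "key" field from the snippet above:

import java.io.File;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;

public class DeleteByKey {
  public static void delete(File indexDir, String key) throws Exception {
    FSDirectory fsd = FSDirectory.getDirectory(indexDir, false);
    IndexWriter writer = new IndexWriter(fsd, new StandardAnalyzer(), false);
    // removes every document whose "key" field contains exactly this term
    writer.deleteDocuments(new Term("key", key));
    writer.close();
  }
}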
