Package org.terrier.structures

Examples of org.terrier.structures.LexiconEntry
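LexiconEntry is Terrier's record for a single term in the lexicon: it carries the term id, the term's collection statistics (total frequency and document frequency) and, in most implementations, a pointer to the term's postings in the inverted index. The fragments below are taken from Terrier's retrieval and indexing code.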


    for (int i = 0; i < queryLength; i++) {
     
      //get the entry statistics - these may have been supplied externally, e.g. by a remote index in distributed retrieval
      EntryStatistics entryStats = queryTerms.getStatistics(queryTermStrings[i]);
      //we seek the query term in the lexicon
      LexiconEntry lEntry = lexicon.getLexiconEntry(queryTermStrings[i]);
      if (entryStats == null)
        entryStats = lEntry;
     
      //and if it is not found, we continue with the next term
      if (lEntry==null)
      {
        //logger.info("Term Not Found: "+queryTermStrings[i]);
        continue;
      }
      queryTerms.setTermProperty(queryTermStrings[i], lEntry);
      logger.debug((i + 1) + ": " + queryTermStrings[i].trim() + " with " + entryStats.getDocumentFrequency()
          + " documents (TF is " + entryStats.getFrequency() + ").");
     
      //check if the IDF is very low, i.e. the term occurs more often than there are documents
      if (IGNORE_LOW_IDF_TERMS && collectionStatistics.getNumberOfDocuments() < lEntry.getFrequency()) {
        logger.debug("query term " + queryTermStrings[i] + " has low idf - ignored from scoring.");
        continue;
      }
     
      //the weighting models are prepared for assigning scores to documents
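The fragment above resolves each query term against the lexicon, falling back to the LexiconEntry itself when no external EntryStatistics were supplied. A minimal sketch of the same lookup, assuming a standard on-disk Terrier index (the index path, prefix and term are placeholders):

    import org.terrier.structures.Index;
    import org.terrier.structures.Lexicon;
    import org.terrier.structures.LexiconEntry;

    Index index = Index.createIndex("/path/to/index", "data"); // placeholder location
    Lexicon<String> lexicon = index.getLexicon();
    LexiconEntry le = lexicon.getLexiconEntry("retrieval");
    if (le == null) {
      // the term does not occur in the collection
    } else {
      // a LexiconEntry is also an EntryStatistics
      System.out.println("termid=" + le.getTermId()
          + " df=" + le.getDocumentFrequency()
          + " tf=" + le.getFrequency());
    }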


    FSOMapFileLexiconOutputStream.addLexiconToIndex(this.index, defaultStructureName, lexiconEntryFactoryValueClass+"$Factory");
  }
 
  protected LexiconEntry newLexiconEntry(int termid)
  {
    LexiconEntry rtr = valueFactory.newInstance();
    rtr.setTermId(termid);
    return rtr;
  }
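The factory method above hides the concrete LexiconEntry class behind a writable factory, so callers only need newInstance() and setTermId(). A short sketch using the stock BasicLexiconEntry factory from the same package (one possible factory, not the only one):

    import org.terrier.structures.BasicLexiconEntry;
    import org.terrier.structures.LexiconEntry;

    BasicLexiconEntry.Factory factory = new BasicLexiconEntry.Factory();
    LexiconEntry entry = factory.newInstance(); // fixed-size, Writable-backed entry
    entry.setTermId(42);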

      }
       
    }
    String targetTerm = null;
    int targetTermId = -1;
    LexiconEntry nextEntryToWrite = null;
    while(terms.size() > 0)
    {
      //what term are we working on
      targetTerm = terms.poll();
      //logger.debug("Current term is "+targetTerm + "length="+targetTerm.length());
      //for each input lexicon
      for(int i=0;i<numLexicons;i++)
      {
        //does this lexicon contain the term
        //logger.debug("Checking lexicon "+i+" for "+targetTerm+"="+lis[i].getTerm());
        if(hasMore[i] && currentEntries[i].getKey().equals(targetTerm))
        {
          if (targetTermId == -1)
          {  //obtain the termid for this term from the first lexicon that has the term
            nextEntryToWrite = newLexiconEntry(targetTermId = currentEntries[i].getValue().getTermId());
          }
          else if (targetTermId != currentEntries[i].getValue().getTermId())
          {  //check the termids match for this term
            logger.error("Term "+targetTerm+" had two termids ("+targetTermId+","+currentEntries[i].getValue().getTermId()+")");
          }
          //logger.debug("Term "+targetTerm + " found in "+i + "termid="+ lis[i].getTermId());
          nextEntryToWrite.add(currentEntries[i].getValue());
          hasMore[i] = lis[i].hasNext();
         
          if (hasMore[i])
          {
            currentEntries[i] = lis[i].next();
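When several input lexicons contain the same term, the merge loop accumulates their statistics into one entry via add(). A minimal sketch of that accumulation, where entryA and entryB are hypothetical entries for the same term from two input lexicons:

    // merge per-lexicon statistics for one term into a single entry
    LexiconEntry merged = new BasicLexiconEntry();
    merged.setTermId(entryA.getTermId());
    merged.add(entryA); // accumulates TF, document frequency and pointer counts
    merged.add(entryB);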

      BitIndexPointer pin = new SimpleBitIndexPointer();
      while(lexiconStream.hasNext())
      {
        Map.Entry<String,LexiconEntry> lee = lexiconStream.next();
        LexiconEntry value = lee.getValue();
        pin.readFields(dis);
        value.setPointer(pin);
        los.writeNextEntry(lee.getKey(), value);
      }
      los.close();
      dis.close();
      Files.delete(LexiconFilename.concat(".tmp2"));
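This pass re-reads a serialised BitIndexPointer for every lexicon entry from a temporary file and attaches it before rewriting the entry. A sketch of the pointer round-trip it relies on, assuming SimpleBitIndexPointer's Hadoop-style Writable contract (dos and dis are data streams over the .tmp2 file):

    // writing side: record where a term's postings start and how many there are
    SimpleBitIndexPointer p = new SimpleBitIndexPointer();
    p.setOffset(1024L, (byte) 3);   // byte offset and bit offset in the bit file
    p.setNumberOfEntries(42);       // the term's document frequency
    p.write(dos);

    // reading side, as in the fragment above: fields come back in the same order
    p.readFields(dis);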

    lastFreq = 0;
    lastDocFreq = 0;
    lastDocument = -1;
    long startOffset = this.getByteOffset();
    byte startBitOffset = this.getBitOffset();
    LexiconEntry le = null;
    // for each run in the list
    int counter = 0;
    //for one term: for each set of postings for that term
    while (run.hasNext()) {
      PostingInRun posting = run.next();
      lastTermWritten = posting.getTerm();
     
      if (posting.getDf() > maxDF)
        maxDF = posting.getDf();
     
      //final int _runMapID = TaskID.forName(_run.getMapNo()).getId();
      //final int runNumber = run.getRunNo();
      final int docOffset = getDocumentOffset(_run.getSplitNo(), _run.getRunNo());
      lastDocument = posting.append(bos, lastDocument, docOffset);
      if (le == null)
        le = posting.getLexiconEntry();
      else
        posting.addToLexiconEntry(le);
      lastFreq += posting.getTF();
      lastDocFreq += posting.getDf();
      counter++;
    }
    le.setTermId(currentTerm++);
    ((BasicLexiconEntry)le).setOffset(startOffset, startBitOffset);
    lexStream.writeNextEntry(lastTermWritten, le);
    numberOfPointers += lastDocFreq;
  }
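Once every run for the current term has been appended, the entry receives its term id and the bit-file position where its postings began, and is flushed to the lexicon stream. A sketch of filling in a BasicLexiconEntry by hand; setStatistics is assumed to take document frequency then total frequency, as in Terrier's indexers:

    BasicLexiconEntry entry = new BasicLexiconEntry();
    entry.setTermId(7);
    entry.setOffset(1024L, (byte) 3); // where this term's postings start
    entry.setStatistics(42, 97);      // assumed order: df = 42, TF = 97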

      DataInputStream dis = new DataInputStream(Files.openFileStream(LexiconFilename.concat(".tmp2")));
      BitIndexPointer pin = new SimpleBitIndexPointer();
      while(lexiconStream.hasNext())
      {
        Map.Entry<String,LexiconEntry> lee = lexiconStream.next();
        LexiconEntry value = lee.getValue();
        pin.readFields(dis);
        value.setPointer(pin);
        los.writeNextEntry(lee.getKey(), value);
      }
      IndexUtil.close(lexiconStream);
      los.close();
      dis.close();

   
      if (! lexiconStream.hasNext())
        break;
     
      Map.Entry<String,LexiconEntry> lee = lexiconStream.next();
      LexiconEntry le = lee.getValue();
     
      _processTerms++;     
      numberOfPointersThisIteration += le.getDocumentFrequency();   
      tmpStorageStorage.add(createPointerForTerm(le));
     
      //TIntIntHashMap returns zero when you look up the value of
      //a key that does not exist in the hash map. For this reason,
      //the values inserted in the hash map are increased by one.
      codesHashMap.put(le.getTermId(), j + 1);
     
      //increment counter
      j++;
    }
    if (logger.isDebugEnabled())
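The j + 1 shift exists because Trove's primitive map returns 0 for absent keys, so a stored value of 0 would be indistinguishable from "not present". A short sketch of the convention (the class and package name follow Trove 2.x, which older Terrier releases bundle):

    import gnu.trove.TIntIntHashMap;

    TIntIntHashMap codesHashMap = new TIntIntHashMap();
    codesHashMap.put(termId, j + 1);     // store index j shifted by one
    int code = codesHashMap.get(termId); // 0 would mean "term not present"
    if (code > 0) {
      int index = code - 1;              // undo the shift to recover j
    }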

   
      if (! lexiconStream.hasNext())
        break;
   
      Map.Entry<String,LexiconEntry> lee = lexiconStream.next();
      LexiconEntry le = lee.getValue();
   
      TIntArrayList[] tmpArray = new TIntArrayList[2 + fieldCount];
      final int tmpNT = le.getDocumentFrequency();
      for (int i=0;i<2+fieldCount;i++)
      {
        tmpArray[i] = new TIntArrayList(tmpNT);
      }
     
      numberOfPointersThisIteration += tmpNT;
     
      tmpStorage[j] = tmpArray;
     
     
      //TIntIntHashMap returns zero when you look up the value of
      //a key that does not exist in the hash map. For this reason,
      //the values inserted in the hash map are increased by one.
      codesHashMap.put(le.getTermId(), j + 1);
    }
    if (logger.isDebugEnabled())
      logger.debug(
        numberOfPointersThisIteration + " pointers == "+
        j +" terms");

   
    // Purge the query terms not present in the lexicon, retrieving each remaining term's statistics from it
    String[] queryTermStrings = queryTerms.getTerms();
    queryTermsToMatchList = new ArrayList<Map.Entry<String,LexiconEntry>>(queryTermStrings.length);
    for (String queryTerm: queryTermStrings) {
      LexiconEntry t = lexicon.getLexiconEntry(queryTerm);
      if (t != null) {
        //check if the term IDF is very low.
        if (IGNORE_LOW_IDF_TERMS && collectionStatistics.getNumberOfDocuments() < t.getFrequency()) {
          //logger.warn("query term " + queryTerm + " has low idf - ignored from scoring.");
          continue;
        }
        // check if the term has weighting models
        WeightingModel[] termWeightingModels = queryTerms.getTermWeightingModels(queryTerm);
        if (termWeightingModels.length == 0) {
          //logger.warn("No weighting models for term " + queryTerm +", skipping scoring");
          continue;
        }
        queryTermsToMatchList.add(new MapEntry<String, LexiconEntry>(queryTerm, t));
      }
      else
        logger.debug("Term Not Found: " + queryTerm);     
    }

    //logger.warn("queryTermsToMatchList = " + queryTermsToMatchList.size());
    int queryLength = queryTermsToMatchList.size();
   
    wm = new WeightingModel[queryLength][];
    for (int i = 0; i < queryLength; i++)
    {
      Map.Entry<String, LexiconEntry> termEntry = queryTermsToMatchList.get(i);
      String queryTerm = termEntry.getKey();
      LexiconEntry lexiconEntry = termEntry.getValue();
      //get the entry statistics - perhaps this came from "far away"
      EntryStatistics entryStats = queryTerms.getStatistics(queryTerm);
      //if none were provided with the query, we take the entry statistics for the query term from the lexicon
      if (entryStats == null)
      {
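The fragment is cut off just before the fallback, but mirroring the first fragment above, the missing branch takes the statistics from the lexicon entry itself:

    EntryStatistics entryStats = queryTerms.getStatistics(queryTerm);
    if (entryStats == null)
      entryStats = lexiconEntry; // a LexiconEntry is itself an EntryStatistics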

   * @param wmodels weighting models to be applied for this query term
   * @throws IOException
   */
  public void addSingleTerm(String queryTerm, double weight, EntryStatistics entryStats, WeightingModel[] wmodels) throws IOException
  {
    LexiconEntry t = lexicon.getLexiconEntry(queryTerm);
    if (t == null) {
      logger.debug("Term Not Found: " + queryTerm);
      //previousTerm = false;     
    } else if (IGNORE_LOW_IDF_TERMS && collectionStatistics.getNumberOfDocuments() < t.getFrequency()) {
      //logger.warn("query term " + queryTerm + " has low idf - ignored from scoring.");
      //previousTerm = false;
    } else if (wmodels.length == 0) {
      //logger.warn("No weighting models for term " + queryTerm +", skipping scoring");
      //previousTerm = false;
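A hedged usage sketch of addSingleTerm, with the signature taken from the fragment; matching stands in for whatever class declares the method, and PL2 is one of Terrier's stock weighting models. Passing null for the statistics presumably lets the method fall back to the lexicon entry, as in the earlier fragments:

    import org.terrier.matching.models.PL2;
    import org.terrier.matching.models.WeightingModel;

    WeightingModel[] wmodels = new WeightingModel[]{ new PL2() };
    matching.addSingleTerm("retrieval", 1.0d, null, wmodels); // matching is hypothetical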
