Package it.unimi.dsi.mg4j.io

Examples of it.unimi.dsi.mg4j.io.ByteArrayPostingList


      if ( indexingIsStandard ) {
        // Standard (document-ordered) indexing: dump the in-memory posting lists
        // for this batch straight to disk as three parallel bitstreams.
        final OutputBitStream index = new OutputBitStream( batchBasename + DiskBasedIndex.INDEX_EXTENSION );
        final OutputBitStream offsets = new OutputBitStream( batchBasename + DiskBasedIndex.OFFSETS_EXTENSION );
        final OutputBitStream posNumBits = new OutputBitStream( batchBasename + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION );

        ByteArrayPostingList baps;
        int maxCount = 0, frequency;
        long bitLength, postings = 0, prevOffset = 0;

        // The offsets stream is gap-encoded; the first offset is always 0.
        offsets.writeGamma( 0 );

        // Terms are written in the (presumably sorted) order of termArray,
        // copying each term's buffered posting data into the index stream.
        for ( int i = 0; i < numTerms; i++ ) {
          baps = termMap.get( termArray[ i ] );
          frequency = baps.frequency;

          // Track the maximum within-document count over all terms in the batch.
          if ( maxCount < baps.maxCount ) maxCount = baps.maxCount;
          // Snapshot the significant bit length before aligning the buffer to a byte boundary.
          bitLength = baps.writtenBits();
          baps.align();

          postings += frequency;

          // Frequencies are stored decremented (a term occurs at least once).
          index.writeGamma( frequency - 1 );
 
          // We need special treatment for terms appearing in all documents
          // (pointers are redundant in that case, so they are stripped out).
          if ( frequency == documentCount ) baps.stripPointers( index, bitLength );
          else index.write( baps.buffer, bitLength );

          frequencies.writeGamma( frequency );
          globCounts.writeLongGamma( baps.globCount );
          // Offsets are stored as gaps from the previous term's end position in the index stream.
          offsets.writeLongGamma( index.writtenBits() - prevOffset );
          posNumBits.writeLongGamma( baps.posNumBits );
          prevOffset = index.writtenBits();
        }

        totPostings += postings;

        // Describe the batch in a properties file so it can be reopened as a FileIndex.
        final Properties properties = new Properties();
        properties.setProperty( Index.PropertyKeys.DOCUMENTS, documentCount );
        properties.setProperty( Index.PropertyKeys.TERMS, numTerms );
        properties.setProperty( Index.PropertyKeys.POSTINGS, postings );
        properties.setProperty( Index.PropertyKeys.MAXCOUNT, maxCount );
        properties.setProperty( Index.PropertyKeys.INDEXCLASS, FileIndex.class.getName() );
        properties.addProperty( Index.PropertyKeys.CODING, "FREQUENCIES:GAMMA" );
        properties.addProperty( Index.PropertyKeys.CODING, "POINTERS:DELTA" );
        // Counts/positions codings are recorded only if the index actually stores them.
        if ( completeness.compareTo( Completeness.COUNTS ) >= 0 ) properties.addProperty( Index.PropertyKeys.CODING, "COUNTS:GAMMA" );
        if ( completeness.compareTo( Completeness.POSITIONS ) >= 0 ) properties.addProperty( Index.PropertyKeys.CODING, "POSITIONS:DELTA" );
        properties.setProperty( Index.PropertyKeys.TERMPROCESSOR, ObjectParser.toSpec( termProcessor ) );
        properties.setProperty( Index.PropertyKeys.OCCURRENCES, numOccurrences );
        properties.setProperty( Index.PropertyKeys.MAXDOCSIZE, maxDocSize );
        properties.setProperty( Index.PropertyKeys.SIZE, index.writtenBits() );
        if ( field != null ) properties.setProperty( Index.PropertyKeys.FIELD, field );
        properties.save( batchBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
        index.close();
        offsets.close();
        posNumBits.close();

      }
      else {
        // Non-standard (e.g. remapped/virtual) indexing: delegate batch writing
        // to a BitStreamIndexWriter instead of dumping buffers directly.
        final IndexWriter indexWriter = new BitStreamIndexWriter( batchBasename, maxDocInBatch + 1, true, flags );

        ByteArrayPostingList bapl;
        OutputBitStream obs;
        int maxCount = -1, maxFrequency = 0, frequency;
        // Compute max frequency and allocate position array.
        for ( ByteArrayPostingList b : termMap.values() ) {
          b.close();
View Full Code Here


   * @param wordReader the word reader associated to the document.
   */
  public void processDocument( final int documentPointer, final WordReader wordReader ) throws IOException {
    // In virtual indexing, positions continue from the document's current size;
    // otherwise positions restart at 0 for each document.
    int pos = indexingIsVirtual ? currSize[ documentPointer ] : 0;
    // In standard indexing documents are numbered by arrival order (documentCount);
    // otherwise the caller-provided pointer is used as-is.
    final int actualPointer = indexingIsStandard ? documentCount : documentPointer;
    ByteArrayPostingList termBapl;

    word.length( 0 );
    nonWord.length( 0 );

    // Iterate over the document's words, recording an occurrence for each
    // term that survives the term processor.
    while ( wordReader.next( word, nonWord ) ) {
      if ( builder != null ) builder.add( word, nonWord );
      if ( word.length() == 0 ) continue;
      if ( !termProcessor.processTerm( word ) ) {
        pos++; // We do consider the positions of terms canceled out by the term processor.
        continue;
      }

      // We check whether we have already seen this term. If not, we add it to the term map.
      if ( ( termBapl = termMap.get( word ) ) == null ) {
        try {
          termBapl = new ByteArrayPostingList( new byte[ BYTE_ARRAY_POSTING_LIST_INITIAL_SIZE ], indexingIsStandard, completeness );
          // word.copy() is essential: `word` is a reused mutable buffer.
          termMap.put( word.copy(), termBapl );
        }
        catch( OutOfMemoryError e ) {
          /* There is not enough memory for enlarging the table. We set a very low growth factor, so at
           * the next put() the enlargement will likely succeed. If not, we will generate several
           * out-of-memory error, but we should get to the end anyway, and we will
           * dump the current batch as soon as the current document is finished. */
          outOfMemoryError = true;
        }
        numTerms++;
        if ( numTerms % TERM_REPORT_STEP == 0 ) LOGGER.info( "[" + Util.format( numTerms ) + " term(s)]" );
      }

      // We now record the occurrence. If a renumbering map has
      // been specified, we have to renumber the document index through it.
      termBapl.setDocumentPointer( actualPointer );
      termBapl.addPosition( pos );
      // Record whether this posting list has an out-of-memory-error problem.
      if ( termBapl.outOfMemoryError ) outOfMemoryError = true;
      occsInCurrDoc++;
      numOccurrences++;
      pos++;
View Full Code Here

TOP

Related Classes of it.unimi.dsi.mg4j.io.ByteArrayPostingList

Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.