Package it.unimi.dsi.mg4j.index

Examples of it.unimi.dsi.mg4j.index.IndexWriter


      LOGGER.debug( "Generating index " + batchBasename + "; documents: " + documentCount );

      try {
        accumulator.flush();
        final InputBitStream ibs = new InputBitStream( accumulatorStream.array );
        final IndexWriter indexWriter = new BitStreamIndexWriter( batchBasename, indexingType == IndexingType.STANDARD ? documentCount : maxDocInBatch + 1, true, flags );
        indexWriter.newInvertedList();
        indexWriter.writeFrequency( documentCount );
        OutputBitStream obs;

        if ( indexingType == IndexingType.STANDARD ) {
          for ( int i = 0; i < documentCount; i++ ) {
            obs = indexWriter.newDocumentRecord();
            indexWriter.writeDocumentPointer( obs, i );
            payload.read( ibs );
            indexWriter.writePayload( obs, payload );
          }
        }
        else {
          // We sort position by pointed document pointer.
          LongArrays.quickSort( position, 0, documentCount, new AbstractLongComparator() {
            public int compare( final long position0, final long position1 ) {
              try {
                ibs.position( position0 );
                final int d0 = ibs.readDelta();
                ibs.position( position1 );
                return d0 - ibs.readDelta();
              }
              catch ( IOException e ) {
                throw new RuntimeException( e );
              }
            }
          } );
          for ( int i = 0; i < documentCount; i++ ) {
            obs = indexWriter.newDocumentRecord();
            ibs.position( position[ i ] );
            indexWriter.writeDocumentPointer( obs, ibs.readDelta() );
            payload.read( ibs );
            indexWriter.writePayload( obs, payload );
          }

          maxDocInBatch = 0;
        }

        indexWriter.close();

        final Properties properties = indexWriter.properties();
        totPostings += properties.getLong( "postings" );
        properties.setProperty( Index.PropertyKeys.OCCURRENCES, -1 );
        properties.setProperty( Index.PropertyKeys.MAXDOCSIZE, -1 );
        properties.setProperty( Index.PropertyKeys.SIZE, indexWriter.writtenBits() );
        properties.setProperty( Index.PropertyKeys.TERMPROCESSOR, NullTermProcessor.class.getName() );
        properties.setProperty( Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName() );
        if ( field != null ) properties.setProperty( Index.PropertyKeys.FIELD, field );
        properties.save( batchBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
View Full Code Here


      if ( totDocuments == 0 ) {
        // Special case: no document has been indexed. We generate an empty batch.
        final String batchBasename = batchBasename( 0, basename, batchDir );
        LOGGER.debug( "Generating empty index " + batchBasename );

        final IndexWriter indexWriter = new BitStreamIndexWriter( batchBasename, 0, true, flags );
        indexWriter.close();
        final Properties properties = indexWriter.properties();
        properties.setProperty( Index.PropertyKeys.SIZE, 0 );
        properties.setProperty( Index.PropertyKeys.OCCURRENCES, -1 );
        properties.setProperty( Index.PropertyKeys.MAXCOUNT, -1 );
        properties.setProperty( Index.PropertyKeys.MAXDOCSIZE, -1 );
        properties.setProperty( Index.PropertyKeys.TERMPROCESSOR, NullTermProcessor.class.getName() );
View Full Code Here

        offsets.close();
        posNumBits.close();

      }
      else {
        final IndexWriter indexWriter = new BitStreamIndexWriter( batchBasename, maxDocInBatch + 1, true, flags );

        ByteArrayPostingList bapl;
        OutputBitStream obs;
        int maxCount = -1, maxFrequency = 0, frequency;
        // Compute max frequency and allocate position array.
        for ( ByteArrayPostingList b : termMap.values() ) {
          b.close();
          b.align();
          if ( maxFrequency < b.frequency ) maxFrequency = b.frequency;
          if ( maxCount < b.maxCount ) maxCount = b.maxCount;
        }

        final long[] bitPos = new long[ maxFrequency ];
        final int[] pointer = new int[ maxFrequency ];
        int[] pos = new int[ maxCount ];
        final boolean hasCounts = completeness.compareTo( Completeness.COUNTS ) >= 0;
        final boolean hasPositions = completeness.compareTo( Completeness.POSITIONS ) >= 0;
        int count = -1, moreCount = -1;
       
        for ( int i = 0; i < numTerms; i++ ) {
          bapl = termMap.get( termArray[ i ] );
          final InputBitStream ibs = new InputBitStream( bapl.buffer );
          frequency = bapl.frequency; // This could be much more than the actual frequency in virtual indices

          // Calculate posting bit positions and corresponding pointers
          for ( int j = 0; j < frequency; j++ ) {
            bitPos[ j ] = ibs.readBits(); // Cache bit poisition
            pointer[ j ] = ibs.readDelta(); // Cache pointer
            if ( hasCounts ) count = ibs.readGamma() + 1;
            if ( hasPositions ) ibs.skipDeltas( count ); // Skip document positions
          }

          // Sort stably pointers and positions by increasing pointer
          it.unimi.dsi.fastutil.Arrays.quickSort( 0, frequency, new AbstractIntComparator() {
            public int compare( final int i0, final int i1 ) {
              final int t = pointer[ i0 ] - pointer[ i1 ];
              if ( t != 0 ) return t;
              final long u = bitPos[ i0 ] - bitPos[ i1 ]; // We need a stable sort
              return u < 0 ? -1 : u > 0 ? 1 : 0;
            }
          },
          new Swapper() {
            public void swap( final int i0, final int i1 ) {
              final long t = bitPos[ i0 ]; bitPos[ i0 ] = bitPos[ i1 ]; bitPos[ i1 ] = t;
              final int p = pointer[ i0 ]; pointer[ i0 ] = pointer[ i1 ]; pointer[ i1 ] = p;
            }
          } );

          int actualFrequency = frequency;
          // Compute actual frequency for virtual indices
          if ( indexingIsVirtual ) {
            actualFrequency = 1;
            for ( int j = 1; j < frequency; j++ ) if ( pointer[ j ] != pointer[ j - 1 ] ) actualFrequency++;
            if ( ASSERTS ) {
              for ( int j = 1; j < frequency; j++ ) {
                assert pointer[ j ] >= pointer[ j - 1 ];
                assert pointer[ j ] != pointer[ j - 1 ] || bitPos[ j ] > bitPos[ j - 1 ];
              }
            }
          }

          indexWriter.newInvertedList();
          indexWriter.writeFrequency( actualFrequency );

          int currPointer;
          for ( int j = 0; j < frequency; j++ ) {
            ibs.position( bitPos[ j ] );
            obs = indexWriter.newDocumentRecord();
            indexWriter.writeDocumentPointer( obs, currPointer = ibs.readDelta() );
            if ( ASSERTS ) assert currPointer == pointer[ j ];
            if ( hasCounts ) count = ibs.readGamma() + 1;
            if ( hasPositions ) {
              ibs.readDeltas( pos, count );
              for ( int p = 1; p < count; p++ ) pos[ p ] += pos[ p - 1 ] + 1;
            }

            if ( indexingIsVirtual ) {
              while( j < frequency - 1 ) {
                ibs.position( bitPos[ j + 1 ] );
                if ( currPointer != ibs.readDelta() ) break;
                j++;
                if ( hasCounts ) moreCount = ibs.readGamma() + 1;
                if ( hasPositions ) {
                  pos = IntArrays.grow( pos, count + moreCount, count );
                  pos[ count ] = ibs.readDelta();
                  if ( ASSERTS ) assert pos[ count ] > pos[ count - 1 ];
                  for ( int p = 1; p < moreCount; p++ ) pos[ count + p ] = pos[ count + p - 1 ] + 1 + ibs.readDelta();
                }
                count += moreCount;
              }
              if ( maxCount < count ) maxCount = count;
            }

            if ( hasCounts ) indexWriter.writePositionCount( obs, count );
            if ( hasPositions ) indexWriter.writeDocumentPositions( obs, pos, 0, count, -1 );
          }

          frequencies.writeGamma( actualFrequency );
          globCounts.writeLongGamma( bapl.globCount );
        }

        indexWriter.close();
        final Properties properties = indexWriter.properties();
        totPostings += properties.getLong( "postings" );
        properties.setProperty( Index.PropertyKeys.TERMPROCESSOR, ObjectParser.toSpec( termProcessor ) );
        properties.setProperty( Index.PropertyKeys.OCCURRENCES, numOccurrences );
        properties.setProperty( Index.PropertyKeys.MAXDOCSIZE, maxDocSize );
        properties.setProperty( Index.PropertyKeys.SIZE, indexWriter.writtenBits() );
        if ( field != null ) properties.setProperty( Index.PropertyKeys.FIELD, field );
        properties.save( batchBasename + DiskBasedIndex.PROPERTIES_EXTENSION );

        if ( indexingIsRemapped ) {
          // We must permute sizes
View Full Code Here

      makeEmpty( batchBasename + DiskBasedIndex.TERMS_EXTENSION );
      makeEmpty( batchBasename + DiskBasedIndex.FREQUENCIES_EXTENSION );
      makeEmpty( batchBasename + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
      if ( ! indexingIsVirtual ) sizes.close();
     
      final IndexWriter indexWriter = new BitStreamIndexWriter( batchBasename, totDocuments, true, flags );
      indexWriter.close();
      final Properties properties = indexWriter.properties();
      properties.setProperty( Index.PropertyKeys.TERMPROCESSOR, ObjectParser.toSpec( termProcessor ) );
      properties.setProperty( Index.PropertyKeys.OCCURRENCES, 0 );
      properties.setProperty( Index.PropertyKeys.MAXCOUNT, 0 );
      properties.setProperty( Index.PropertyKeys.MAXDOCSIZE, maxDocSize );
      properties.setProperty( Index.PropertyKeys.SIZE, 0 );
View Full Code Here

TOP

Related Classes of it.unimi.dsi.mg4j.index.IndexWriter

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.