Package: it.unimi.dsi.io

Code examples using it.unimi.dsi.io.OutputBitStream


   
    int i;
    int prev = -1;
    int bitCount = 0;
    final int end = offset + len;
    final OutputBitStream positions = this.positions;
   
    switch( positionCoding ) {
      case GAMMA:
        if ( COOKIES ) bitCount += positions.writeGamma( Integer.MAX_VALUE );
        for( i = offset; i < end; i++ ) {
          bitCount += positions.writeGamma( occ[ i ] - prev - 1 );
          prev = occ[ i ];
        }
        break;
      case DELTA:
        if ( COOKIES ) bitCount += positions.writeDelta( Integer.MAX_VALUE );
        for( i = offset; i < end; i++ ) {
          bitCount += positions.writeDelta( occ[ i ] - prev - 1 );
          prev = occ[ i ];
        }
        break;
      case SHIFTED_GAMMA:
        if ( COOKIES ) bitCount += positions.writeShiftedGamma( Integer.MAX_VALUE );
        for( i = offset; i < end; i++ ) {
          bitCount += positions.writeShiftedGamma( occ[ i ] - prev - 1 );
          prev = occ[ i ];
        }
        break;
      default:
        throw new IllegalStateException( "The required position coding (" + positionCoding + ") is not supported." );
View Full Code Here


                outputStream.writeInt( indexIterator.nextDocument() );
                bufSize--;
              }
              if ( index.hasPayloads ) {
                // TODO: this is *very rough* & preliminary
                OutputBitStream obs = new OutputBitStream( outputStream );
                index.payload.write( obs );
                obs.flush();
              }
              if ( index.hasCounts ) {
                outputStream.writeInt( count = indexIterator.count() );
                bufSize--;
                if ( index.hasPositions ) {
View Full Code Here

    final PrintWriter pw = new PrintWriter( new OutputStreamWriter( new FastBufferedOutputStream( new FileOutputStream( batchBasename + DiskBasedIndex.TERMS_EXTENSION ), bufferSize ), "UTF-8" ) );
    for ( MutableString t : termArray ) t.println( pw );
    pw.close();

    try {
      final OutputBitStream frequencies = new OutputBitStream( batchBasename + DiskBasedIndex.FREQUENCIES_EXTENSION );
      final OutputBitStream globCounts = new OutputBitStream( batchBasename + DiskBasedIndex.GLOBCOUNTS_EXTENSION );

      if ( indexingIsStandard ) {
        final OutputBitStream index = new OutputBitStream( batchBasename + DiskBasedIndex.INDEX_EXTENSION );
        final OutputBitStream offsets = new OutputBitStream( batchBasename + DiskBasedIndex.OFFSETS_EXTENSION );
        final OutputBitStream posNumBits = new OutputBitStream( batchBasename + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION );

        ByteArrayPostingList baps;
        int maxCount = 0, frequency;
        long bitLength, postings = 0, prevOffset = 0;

        offsets.writeGamma( 0 );

        for ( int i = 0; i < numTerms; i++ ) {
          baps = termMap.get( termArray[ i ] );
          frequency = baps.frequency;

          if ( maxCount < baps.maxCount ) maxCount = baps.maxCount;
          bitLength = baps.writtenBits();
          baps.align();

          postings += frequency;

          index.writeGamma( frequency - 1 );
 
          // We need special treatment for terms appearing in all documents
          if ( frequency == documentCount ) baps.stripPointers( index, bitLength );
          else index.write( baps.buffer, bitLength );

          frequencies.writeGamma( frequency );
          globCounts.writeLongGamma( baps.globCount );
          offsets.writeLongGamma( index.writtenBits() - prevOffset );
          posNumBits.writeLongGamma( baps.posNumBits );
          prevOffset = index.writtenBits();
        }

        totPostings += postings;

        final Properties properties = new Properties();
        properties.setProperty( Index.PropertyKeys.DOCUMENTS, documentCount );
        properties.setProperty( Index.PropertyKeys.TERMS, numTerms );
        properties.setProperty( Index.PropertyKeys.POSTINGS, postings );
        properties.setProperty( Index.PropertyKeys.MAXCOUNT, maxCount );
        properties.setProperty( Index.PropertyKeys.INDEXCLASS, FileIndex.class.getName() );
        properties.addProperty( Index.PropertyKeys.CODING, "FREQUENCIES:GAMMA" );
        properties.addProperty( Index.PropertyKeys.CODING, "POINTERS:DELTA" );
        if ( completeness.compareTo( Completeness.COUNTS ) >= 0 ) properties.addProperty( Index.PropertyKeys.CODING, "COUNTS:GAMMA" );
        if ( completeness.compareTo( Completeness.POSITIONS ) >= 0 ) properties.addProperty( Index.PropertyKeys.CODING, "POSITIONS:DELTA" );
        properties.setProperty( Index.PropertyKeys.TERMPROCESSOR, ObjectParser.toSpec( termProcessor ) );
        properties.setProperty( Index.PropertyKeys.OCCURRENCES, numOccurrences );
        properties.setProperty( Index.PropertyKeys.MAXDOCSIZE, maxDocSize );
        properties.setProperty( Index.PropertyKeys.SIZE, index.writtenBits() );
        if ( field != null ) properties.setProperty( Index.PropertyKeys.FIELD, field );
        properties.save( batchBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
        index.close();
        offsets.close();
        posNumBits.close();

      }
      else {
        final IndexWriter indexWriter = new BitStreamIndexWriter( batchBasename, maxDocInBatch + 1, true, flags );

        ByteArrayPostingList bapl;
        OutputBitStream obs;
        int maxCount = -1, maxFrequency = 0, frequency;
        // Compute max frequency and allocate position array.
        for ( ByteArrayPostingList b : termMap.values() ) {
          b.close();
          b.align();
          if ( maxFrequency < b.frequency ) maxFrequency = b.frequency;
          if ( maxCount < b.maxCount ) maxCount = b.maxCount;
        }

        final long[] bitPos = new long[ maxFrequency ];
        final int[] pointer = new int[ maxFrequency ];
        int[] pos = new int[ maxCount ];
        final boolean hasCounts = completeness.compareTo( Completeness.COUNTS ) >= 0;
        final boolean hasPositions = completeness.compareTo( Completeness.POSITIONS ) >= 0;
        int count = -1, moreCount = -1;
       
        for ( int i = 0; i < numTerms; i++ ) {
          bapl = termMap.get( termArray[ i ] );
          final InputBitStream ibs = new InputBitStream( bapl.buffer );
          frequency = bapl.frequency; // This could be much more than the actual frequency in virtual indices

          // Calculate posting bit positions and corresponding pointers
          for ( int j = 0; j < frequency; j++ ) {
            bitPos[ j ] = ibs.readBits(); // Cache bit position
            pointer[ j ] = ibs.readDelta(); // Cache pointer
            if ( hasCounts ) count = ibs.readGamma() + 1;
            if ( hasPositions ) ibs.skipDeltas( count ); // Skip document positions
          }

          // Sort stably pointers and positions by increasing pointer
          it.unimi.dsi.fastutil.Arrays.quickSort( 0, frequency, new AbstractIntComparator() {
            public int compare( final int i0, final int i1 ) {
              final int t = pointer[ i0 ] - pointer[ i1 ];
              if ( t != 0 ) return t;
              final long u = bitPos[ i0 ] - bitPos[ i1 ]; // We need a stable sort
              return u < 0 ? -1 : u > 0 ? 1 : 0;
            }
          },
          new Swapper() {
            public void swap( final int i0, final int i1 ) {
              final long t = bitPos[ i0 ]; bitPos[ i0 ] = bitPos[ i1 ]; bitPos[ i1 ] = t;
              final int p = pointer[ i0 ]; pointer[ i0 ] = pointer[ i1 ]; pointer[ i1 ] = p;
            }
          } );

          int actualFrequency = frequency;
          // Compute actual frequency for virtual indices
          if ( indexingIsVirtual ) {
            actualFrequency = 1;
            for ( int j = 1; j < frequency; j++ ) if ( pointer[ j ] != pointer[ j - 1 ] ) actualFrequency++;
            if ( ASSERTS ) {
              for ( int j = 1; j < frequency; j++ ) {
                assert pointer[ j ] >= pointer[ j - 1 ];
                assert pointer[ j ] != pointer[ j - 1 ] || bitPos[ j ] > bitPos[ j - 1 ];
              }
            }
          }

          indexWriter.newInvertedList();
          indexWriter.writeFrequency( actualFrequency );

          int currPointer;
          for ( int j = 0; j < frequency; j++ ) {
            ibs.position( bitPos[ j ] );
            obs = indexWriter.newDocumentRecord();
            indexWriter.writeDocumentPointer( obs, currPointer = ibs.readDelta() );
            if ( ASSERTS ) assert currPointer == pointer[ j ];
            if ( hasCounts ) count = ibs.readGamma() + 1;
            if ( hasPositions ) {
              ibs.readDeltas( pos, count );
              for ( int p = 1; p < count; p++ ) pos[ p ] += pos[ p - 1 ] + 1;
            }

            if ( indexingIsVirtual ) {
              while( j < frequency - 1 ) {
                ibs.position( bitPos[ j + 1 ] );
                if ( currPointer != ibs.readDelta() ) break;
                j++;
                if ( hasCounts ) moreCount = ibs.readGamma() + 1;
                if ( hasPositions ) {
                  pos = IntArrays.grow( pos, count + moreCount, count );
                  pos[ count ] = ibs.readDelta();
                  if ( ASSERTS ) assert pos[ count ] > pos[ count - 1 ];
                  for ( int p = 1; p < moreCount; p++ ) pos[ count + p ] = pos[ count + p - 1 ] + 1 + ibs.readDelta();
                }
                count += moreCount;
              }
              if ( maxCount < count ) maxCount = count;
            }

            if ( hasCounts ) indexWriter.writePositionCount( obs, count );
            if ( hasPositions ) indexWriter.writeDocumentPositions( obs, pos, 0, count, -1 );
          }

          frequencies.writeGamma( actualFrequency );
          globCounts.writeLongGamma( bapl.globCount );
        }

        indexWriter.close();
        final Properties properties = indexWriter.properties();
        totPostings += properties.getLong( "postings" );
        properties.setProperty( Index.PropertyKeys.TERMPROCESSOR, ObjectParser.toSpec( termProcessor ) );
        properties.setProperty( Index.PropertyKeys.OCCURRENCES, numOccurrences );
        properties.setProperty( Index.PropertyKeys.MAXDOCSIZE, maxDocSize );
        properties.setProperty( Index.PropertyKeys.SIZE, indexWriter.writtenBits() );
        if ( field != null ) properties.setProperty( Index.PropertyKeys.FIELD, field );
        properties.save( batchBasename + DiskBasedIndex.PROPERTIES_EXTENSION );

        if ( indexingIsRemapped ) {
          // We must permute sizes
          final int[] document = new int[ documentCount ], size = new int[ documentCount ];
          final InputBitStream sizes = new InputBitStream( batchBasename + DiskBasedIndex.SIZES_EXTENSION );
          for ( int i = 0; i < documentCount; i++ ) {
            document[ i ] = sizes.readGamma();
            size[ i ] = sizes.readGamma();
          }
          sizes.close();
         
          it.unimi.dsi.fastutil.Arrays.quickSort( 0, documentCount, new AbstractIntComparator() {
            public int compare( int x, int y ) {
              return document[ x ] - document[ y ];
            }
          }, new Swapper() {
            public void swap( int x, int y ) {
              int t = document[ x ];
              document[ x ] = document[ y ];
              document[ y ] = t;
              t = size[ x ];
              size[ x ] = size[ y ];
              size[ y ] = t;
            }
          } );


          final OutputBitStream permutedSizes = new OutputBitStream( batchBasename( batch, basename, batchDir ) + DiskBasedIndex.SIZES_EXTENSION );
          for ( int i = 0, d = 0; i < documentCount; i++ ) {
            while ( d++ < document[ i ] )
              permutedSizes.writeGamma( 0 );
            permutedSizes.writeGamma( size[ i ] );
          }
          permutedSizes.close();
        }
      }
     
      if ( indexingIsVirtual ) {
        final OutputBitStream sizes = new OutputBitStream( batchBasename( batch, basename, batchDir ) + DiskBasedIndex.SIZES_EXTENSION );
        for ( int i = 0; i < currSize.length; i++ ) sizes.writeGamma( currSize[ i ] );
        sizes.close();
      }

      globCounts.close();
      frequencies.close();
      termMap.clear();
View Full Code Here

      throw e;
    }
  }

  protected void openSizeBitStream() throws FileNotFoundException {
    if ( ! indexingIsVirtual ) sizes = new OutputBitStream( batchBasename( batch, basename, batchDir ) + DiskBasedIndex.SIZES_EXTENSION );
  }
View Full Code Here

    return basename;
  }
   
  public void open( final CharSequence suffix ) throws IOException {
    basenameSuffix = basename + suffix;
    documentsOutputBitStream = new OutputBitStream( basenameSuffix + SimpleCompressedDocumentCollection.DOCUMENTS_EXTENSION );
    termsOutputStream = new CountingOutputStream( new FastBufferedOutputStream( new FileOutputStream( basenameSuffix + SimpleCompressedDocumentCollection.TERMS_EXTENSION ) ) );
    nonTermsOutputStream = exact ? new CountingOutputStream( new FastBufferedOutputStream( new FileOutputStream( basenameSuffix + SimpleCompressedDocumentCollection.NONTERMS_EXTENSION ) ) ) : null;
    documentOffsetsObs = new OutputBitStream( basenameSuffix + SimpleCompressedDocumentCollection.DOCUMENT_OFFSETS_EXTENSION );
    termOffsetsObs = new OutputBitStream( basenameSuffix + SimpleCompressedDocumentCollection.TERM_OFFSETS_EXTENSION );
    nonTermOffsetsObs = exact? new OutputBitStream( basenameSuffix + SimpleCompressedDocumentCollection.NONTERM_OFFSETS_EXTENSION ) : null;
    fieldContent = new IntArrayList();

    if ( hasNonText ) nonTextZipDataOutputStream = new DataOutputStream( nonTextZipOutputStream = new ZipOutputStream( new FastBufferedOutputStream( new FileOutputStream( basenameSuffix + ZipDocumentCollection.ZIP_EXTENSION ) ) ) );

    terms.clear();
View Full Code Here

   * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).
   * @param quantum the quantum; it must be zero, or a power of two; if it is zero, a variable-quantum index is assumed.
   * @param height the maximum height of a skip tower; the cache will contain at most <var>2<sup>h</sup></var> document records.
   */
  public SkipBitStreamIndexWriter( final CharSequence basename, final int numberOfDocuments, final boolean writeOffsets, final Map<Component,Coding> flags, final int quantum, final int height ) throws IOException {
    this( new OutputBitStream( basename + DiskBasedIndex.INDEX_EXTENSION ),
        writeOffsets ? new OutputBitStream( basename + DiskBasedIndex.OFFSETS_EXTENSION ) : null,
        writeOffsets && flags.get( Component.POSITIONS ) != null ? new OutputBitStream( basename + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION ) : null,
        numberOfDocuments, DEFAULT_TEMP_BUFFER_SIZE, flags, quantum, height );
  }
View Full Code Here

   * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).
   * @param quantum the quantum; it must be zero, or a power of two; if it is zero, a variable-quantum index is assumed.
   * @param height the maximum height of a skip tower; the cache will contain at most 2<sup><var>h</var></sup> document records.
   */
  public SkipBitStreamIndexWriter( final CharSequence basename, final int numberOfDocuments, final boolean writeOffsets, int tempBufferSize, final Map<Component,Coding> flags, final int quantum, final int height ) throws IOException {
    // Open the underlying streams and hand them to the main constructor. Because
    // this(...) must be the first statement, the optional streams are created inline:
    //   - offsets stream: only when writeOffsets is true;
    //   - positions-bit-count stream: only when offsets are written AND the flag map
    //     declares a POSITIONS component (otherwise there are no position bits to count).
    this( new OutputBitStream( basename + DiskBasedIndex.INDEX_EXTENSION ),
        writeOffsets ? new OutputBitStream( basename + DiskBasedIndex.OFFSETS_EXTENSION ) : null,
        writeOffsets && flags.get( Component.POSITIONS ) != null ? new OutputBitStream( basename + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION ) : null,
        numberOfDocuments, tempBufferSize, flags, quantum, height );
  }
View Full Code Here

    cacheDataLength = new int[ two2h ];
    cachePointer = new OutputBitStream[ two2h ];
    cachePointerByte = new FastByteArrayOutputStream[ two2h ];

    for ( int i = 0; i < two2h; i++ )
      cachePointer[ i ] = new OutputBitStream( cachePointerByte[ i ] = new FastByteArrayOutputStream(), 0 );

    cacheSkip = new OutputBitStream[ two2h ];
    cacheSkipBitCount = new OutputBitStream[ two2h ];
    cacheSkipByte = new FastByteArrayOutputStream[ two2h ];

    for ( int i = 0; i < two2h; i++ ) {
      cacheSkip[ i ] = new OutputBitStream( cacheSkipByte[ i ] = new FastByteArrayOutputStream(), 0 );
      cacheSkipBitCount[ i ] = new OutputBitStream( NullOutputStream.getInstance(), 0 );
    }

    skipPointer = new int[ two2h + 1 ];
    distance = new long[ two2h + 1 ];

    bitCount = new OutputBitStream( NullOutputStream.getInstance(), 0 );

    towerTopB = new int[ height + 1 ];
    towerTopLog2B = new int[ height + 1 ];
    towerLowerB = new int[ height + 1 ];
    towerLowerLog2B = new int[ height + 1 ];
View Full Code Here

      String name = localBasename[ i ];
      if ( ! interleaved ) indexWriter[ i ] = new BitStreamHPIndexWriter( localBasename[ i ], strategy.numberOfDocuments( i ), true, skipBufferSize, writerFlags, quantum, height );
      else if ( ! skips ) indexWriter[ i ] = new BitStreamIndexWriter( localBasename[ i ], strategy.numberOfDocuments( i ), true, writerFlags );
      else indexWriter[ i ] = new SkipBitStreamIndexWriter( localBasename[ i ], strategy.numberOfDocuments( i ), true, skipBufferSize, writerFlags, quantum, height );
     
      if ( haveCounts ) localGlobCounts[ i ] = new OutputBitStream( name + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
      localFrequencies[ i ] = new OutputBitStream( name + DiskBasedIndex.FREQUENCIES_EXTENSION );
      localTerms[ i ] = new PrintWriter( new BufferedWriter( new OutputStreamWriter( new FileOutputStream( localBasename[ i ] + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) ) );     
    }
   
    terms = new FastBufferedReader( new InputStreamReader( new FileInputStream( inputBasename + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) );
  }
View Full Code Here

  private void partitionSizes() throws IOException {     
    final File sizesFile = new File( inputBasename + DiskBasedIndex.SIZES_EXTENSION );
    if ( sizesFile.exists() ) {
      LOGGER.info( "Partitioning sizes..." );
      final InputBitStream sizes = new InputBitStream ( sizesFile );
      final OutputBitStream localSizes[] = new OutputBitStream[ numIndices ];
      for ( int i = 0; i < numIndices; i++ ) localSizes[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.SIZES_EXTENSION );

      // ALERT: for the time being, we decide whether to "fill the gaps" in sizes using as sole indicator the equality between global and local number of documents.
      int size, localIndex;
      if ( globalIndex.numberOfDocuments == strategy.numberOfDocuments( 0 ) ) {
        for( int i = 0; i < globalIndex.numberOfDocuments; i++ ) {
View Full Code Here

TOP

Related Classes of it.unimi.dsi.io.OutputBitStream

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., owned by Oracle Inc. Contact: software#gmail.com.