Package: it.unimi.dsi.io

Examples of it.unimi.dsi.io.OutputBitStream


    // Maximum document size seen while combining sizes; -1 when sizes are not written.
    final int maxDocSize;

    if ( writeSizes ) {
      logger.info( "Combining sizes..." );
      final OutputBitStream sizesOutputBitStream = new OutputBitStream( outputBasename + DiskBasedIndex.SIZES_EXTENSION, bufferSize );
      maxDocSize = combineSizes( sizesOutputBitStream );
      sizesOutputBitStream.close();
      logger.info( "Sizes combined." );
    }
    else maxDocSize = -1;

    // To write the global count of each term (only when requested)
    final OutputBitStream outputGlobCounts = writeGlobCounts ? new OutputBitStream( outputBasename + DiskBasedIndex.GLOBCOUNTS_EXTENSION ) : null;
    // To write the frequency of each term
    final OutputBitStream frequencies = new OutputBitStream( outputBasename + DiskBasedIndex.FREQUENCIES_EXTENSION );
    // To write the new term list
    final PrintWriter termFile = new PrintWriter( new BufferedWriter( new OutputStreamWriter( new FileOutputStream( outputBasename + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ), bufferSize ) );

    // The current term
    MutableString currTerm;

    // Total number of pointers and occurrences
    long numPointers = 0;

    // Progress is measured in occurrences when global counts are available, in terms otherwise.
    pl.expectedUpdates = writeGlobCounts ? numberOfOccurrences : -1;
    pl.itemsName = writeGlobCounts ? "occurrences" : "terms";
    pl.logInterval = logInterval;
    pl.start( "Combining lists..." );

    int totalFrequency, numTerms = 0, numUsedIndices, k;
    long totalGlobCount = 0;
    // Size/length predictions used by the variable-quantum writer; valid only when p != 0.
    predictedSize = -1;
    predictedLengthNumBits = -1;

    // Discard first zero from offsets
    if ( p != 0 ) for( InputBitStream ibs: offsets ) ibs.readGamma();

    // TODO: use the front of the queue?
    while( ! termQueue.isEmpty() ) {
      numUsedIndices = 0;
      // We read a new word from the queue, copy it and write it to the term file.
      // termQueue.first() yields the index holding the smallest term; record it in usedIndex.
      currTerm = term[ k = usedIndex[ numUsedIndices++ ] = termQueue.first() ].copy();

      if ( DEBUG ) System.err.println( "Merging term " + currTerm );

      currTerm.println( termFile );
      // Advance the reader that supplied the term; drop it from the queue when exhausted.
      if ( termReader[ k ].readLine( term[ k ] ) == null ) termQueue.dequeue();
      else termQueue.changed();

      // Then, we extract all equal words from the queue, accumulating the set of indices in inIndex and currIndex
      while( ! termQueue.isEmpty() && term[ termQueue.first() ].equals( currTerm ) ) {
        k = usedIndex[ numUsedIndices++ ] = termQueue.first();
        if ( termReader[ k ].readLine( term[ k ] ) == null ) termQueue.dequeue();
        else termQueue.changed();
      }

      // Keep the indices contributing to this term sorted, so lists are combined in index order.
      if ( numUsedIndices > 1 ) Arrays.sort( usedIndex, 0, numUsedIndices );

      // Load index iterators
      for( int i = numUsedIndices; i-- != 0; ) indexIterator[ usedIndex[ i ] ] = indexReader[ usedIndex[ i ] ].nextIterator();

      numTerms++;

      if ( writeGlobCounts ) {
        // Compute and write the total global count. This works for all kind of indices.
        totalGlobCount = 0;
        for( int i = 0; i < numUsedIndices; i++ ) totalGlobCount += globCounts[ usedIndex[ i ] ].readLongGamma();
        outputGlobCounts.writeLongGamma( totalGlobCount );
      }

      if ( p != 0 ) {
        // Variable-quantum case: predict the combined list size from the per-index offset gaps.
        predictedSize = 0;
        predictedLengthNumBits = 0;

        for( int i = numUsedIndices; i-- != 0; ) {

          if ( index[ usedIndex[ i ] ] instanceof BitStreamHPIndex ) {
            // High-performance index: positions live in a separate stream, so the offset gap is data only.
            predictedSize += offsets[ usedIndex[ i ] ].readLongGamma();
            if ( hasPositions ) predictedLengthNumBits += posNumBits[ usedIndex[ i ] ].readLongGamma();
          }
          else {
            // Interleaved index: we must subtract the number of bits used for positions from the length of the overall inverted list
            final long t = hasPositions ? posNumBits[ usedIndex[ i ] ].readLongGamma() : 0;
            predictedSize += offsets[ usedIndex[ i ] ].readLongGamma() - t;
            predictedLengthNumBits += t;
          }
        }
      }

      totalFrequency = combine( numUsedIndices );
      frequencies.writeGamma( totalFrequency );
      numPointers += totalFrequency;

      /* A trick to get a correct prediction: expectedUpdates counts occurrences, so we
       * pre-add this term's global count minus the single unit added by update() below. */
      if ( writeGlobCounts ) pl.count += totalGlobCount - 1;
      pl.update();
    }
    pl.done();

    // Close metadata outputs; outputGlobCounts is null unless writeGlobCounts is set.
    if ( writeGlobCounts ) outputGlobCounts.close();
    termFile.close();
    frequencies.close();

    // Release per-index resources (this fragment is truncated at this point in the page).
    if ( ! metadataOnly ) {
      for( int i = numIndices; i-- != 0; ) {
        indexReader[ i ].close();
        if ( writeGlobCounts ) globCounts[ i ].close();
View Full Code Here


    for( int k = numUsedIndices; k-- != 0; )
      totalFrequency += ( frequency[ usedIndex[ k ] ] = indexIterator[ usedIndex[ k ] ].frequency() );

    if ( ! metadataOnly ) {
      int currIndex, numPrevDocs = 0, currDoc, count;
      OutputBitStream obs;
      Index i;
      IndexIterator ii;

      if ( p != 0 ) variableQuantumIndexWriter.newInvertedList(totalFrequency, p, predictedSize, predictedLengthNumBits );
      else indexWriter.newInvertedList();
View Full Code Here

      else indexWriter.newInvertedList();

      indexWriter.writeFrequency( totalFrequency );

      int currDoc = -1, count;
      OutputBitStream obs;
      Index i;
      IndexIterator ir;

      while( ! documentQueue.isEmpty() ) {
        // We extract the smallest document pointer, and enqueue it in the new index.
View Full Code Here

    /* If we're merging just one list, merging is fine, and moreover
     * maxCount need not be updated, as it is already initialised to
     * the maximum over all indices. */
    int currIndex, prevDoc = -1, currDoc, count;
    int temp[];
    OutputBitStream obs;
    Index i;
    IndexIterator ii;
 
    // Note that the total frequency can be computed only during the merge.
    for( int k = numUsedIndices; k-- != 0; ) {
View Full Code Here

      this.cutPoints = new IntArrayList();
      this.cutPoints.add( 0 );

      flags = new EnumMap<Component, Coding>( CompressionFlags.DEFAULT_PAYLOAD_INDEX );
      accumulatorStream = new FastByteArrayOutputStream();
      accumulator = new OutputBitStream( accumulatorStream );
    }
View Full Code Here

        accumulator.flush();
        final InputBitStream ibs = new InputBitStream( accumulatorStream.array );
        final IndexWriter indexWriter = new BitStreamIndexWriter( batchBasename, indexingType == IndexingType.STANDARD ? documentCount : maxDocInBatch + 1, true, flags );
        indexWriter.newInvertedList();
        indexWriter.writeFrequency( documentCount );
        OutputBitStream obs;

        if ( indexingType == IndexingType.STANDARD ) {
          for ( int i = 0; i < documentCount; i++ ) {
            obs = indexWriter.newDocumentRecord();
            indexWriter.writeDocumentPointer( obs, i );
View Full Code Here

    final InputBitStream frequencies = new InputBitStream( inputBasename + DiskBasedIndex.FREQUENCIES_EXTENSION );
    final InputBitStream globCounts = new InputBitStream( inputBasename + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
    offsets.readGamma();
   
    for( int i = 0; i < numIndices; i++ ) {
      localIndexStream[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.INDEX_EXTENSION, bufferSize );
      if ( isHighPerformance ) localPositionsStream[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.POSITIONS_EXTENSION, bufferSize );
      localFrequencies[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.FREQUENCIES_EXTENSION );
      localGlobCounts[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
      localTerms[ i ] = new PrintWriter( new OutputStreamWriter( new FastBufferedOutputStream( new FileOutputStream( localBasename[ i ] + DiskBasedIndex.TERMS_EXTENSION ) ), "UTF-8" ) );
      localOffsets[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.OFFSETS_EXTENSION );
      if ( posNumBits != null ) localPosNumBits[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION );
      localOffsets[ i ].writeGamma( 0 );
    }

    // The current term
    final MutableString currTerm = new MutableString();
View Full Code Here

   * @param quantum the quantum; it must be zero, or a power of two; if it is zero, a variable-quantum index is assumed.
   * @param height the maximum height of a skip tower; the cache will contain at most 2<sup><var>h</var></sup> document records.
   */
  // Convenience constructor: opens the index, positions and (optionally) offsets and
  // position-bit-count output bit streams for the given basename, then delegates to the
  // stream-based constructor. NOTE(review): must be a bare this(...) call — Java requires
  // constructor delegation to be the first statement.
  public BitStreamHPIndexWriter( final CharSequence basename, final int numberOfDocuments, final boolean writeOffsets,  int tempBufferSize, final Map<Component,Coding> flags, final int quantum, final int height ) throws IOException {
    this(
      new OutputBitStream( basename + DiskBasedIndex.INDEX_EXTENSION ),
      new OutputBitStream( basename + DiskBasedIndex.POSITIONS_EXTENSION ),
      // Offsets and the per-list position bit counts are written only on request.
      writeOffsets ? new OutputBitStream( basename + DiskBasedIndex.OFFSETS_EXTENSION ) : null,
      writeOffsets ? new OutputBitStream( basename + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION ) : null,
      numberOfDocuments,
      tempBufferSize,
      flags, quantum , height
     );
  }
View Full Code Here

    cachePositionsLength = new long[ two2h + 1 ];
    cachePointer = new OutputBitStream[ two2h ];
    cachePointerByte = new FastByteArrayOutputStream[ two2h ];

    for ( int i = 0; i < two2h; i++ )
      cachePointer[ i ] = new OutputBitStream( cachePointerByte[ i ] = new FastByteArrayOutputStream(), 0 );

    cacheSkip = new OutputBitStream[ two2h ];
    cacheSkipBitCount = new OutputBitStream[ two2h ];
    cacheSkipByte = new FastByteArrayOutputStream[ two2h ];

    for ( int i = 0; i < two2h; i++ ) {
      cacheSkip[ i ] = new OutputBitStream( cacheSkipByte[ i ] = new FastByteArrayOutputStream(), 0 );
      cacheSkipBitCount[ i ] = new OutputBitStream( NullOutputStream.getInstance(), 0 );
    }

    skipPointer = new int[ two2h + 1 ];
    distance = new long[ two2h + 1 ];

    bitCount = new OutputBitStream( NullOutputStream.getInstance(), 0 );

    towerTopB = new int[ height + 1 ];
    towerTopLog2B = new int[ height + 1 ];
    towerLowerB = new int[ height + 1 ];
    towerLowerLog2B = new int[ height + 1 ];
View Full Code Here

    // If the previous block is over, write it out!

    if ( cache == w ) writeOutCache( pointer );

    final OutputBitStream out;
   
    // Record data pointer if we are on a skip; otherwise, write it to the cache.
    if ( ( cache & quantumModuloMask ) == 0 ) {
      if ( cache >>> quantumDivisionShift > 0 ) {
        cacheDataLength[ ( cache >>> quantumDivisionShift ) - 1 ] = (int)cacheDataOut.writtenBits();
        if ( ASSERTS ) assert positions.writtenBits() - writtenPositionsBitsAtLastQuantum <= Integer.MAX_VALUE : ( positions.writtenBits() - writtenPositionsBitsAtLastQuantum ) + " > " + Integer.MAX_VALUE;
        cachePositionsLength[ ( cache >>> quantumDivisionShift ) -1 ] = (int)( positions.writtenBits() - writtenPositionsBitsAtLastQuantum );
        writtenPositionsBitsAtLastQuantum = positions.writtenBits();
      }
      cacheDataOut.align();
      cacheDataOut.writtenBits( 0 );
      skipPointer[ cache >>> quantumDivisionShift ] = pointer;
      out = cachePointer[ cache++ >>> quantumDivisionShift ];
    }
    else {
      cache++;
      out = cacheDataOut;
    }

    currentDocument = pointer;
    int bitCount = 0;

    if ( frequency != numberOfDocuments ) { // We do not write pointers for everywhere occurring documents.
      switch( pointerCoding ) {
        case UNARY:
          bitCount = out.writeUnary( pointer - lastDocument - 1 );
          break;
        case SHIFTED_GAMMA:
          bitCount = out.writeShiftedGamma( pointer - lastDocument - 1 );
          break;
        case GAMMA:
          bitCount = out.writeGamma( pointer - lastDocument - 1 );
          break;
        case DELTA:
          bitCount = out.writeDelta( pointer - lastDocument - 1 );
          break;
        case GOLOMB:
          bitCount = out.writeGolomb( pointer - lastDocument - 1, b, log2b );
          break;
        default:
          throw new IllegalStateException( "The required pointer coding (" + pointerCoding + ") is not supported." );
      }
    }
View Full Code Here

TOP

Related Classes of it.unimi.dsi.io.OutputBitStream

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., owned by Oracle Inc. Contact coftware#gmail.com.