Package it.unimi.dsi.logging

Examples of it.unimi.dsi.logging.ProgressLogger
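
All of the snippets below follow the same ProgressLogger lifecycle: configure the public fields ( expectedUpdates, itemsName, logInterval, displayFreeMemory ), bracket the work with start()/done(), and call update() once per item (or lightUpdate() in very tight loops). A minimal, self-contained sketch of that pattern; the default constructor and the item count are illustrative assumptions, not taken from the examples:

  import it.unimi.dsi.logging.ProgressLogger;

  public class ProgressLoggerSketch {
    public static void main( String[] args ) {
      final ProgressLogger pl = new ProgressLogger(); // default logger and log interval
      pl.expectedUpdates = 1000000; // enables percentages and completion-time estimates
      pl.itemsName = "items";       // the noun used in log lines
      pl.displayFreeMemory = true;  // also report free/total memory at each log
      pl.start( "Processing items..." );
      for( int i = 0; i < 1000000; i++ ) {
        // ... process one item ...
        pl.update();                // logs at most one line per logInterval
      }
      pl.done();                    // logs elapsed time and average throughput
    }
  }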


  protected abstract int combine( int numUsedIndices ) throws IOException;
 
 
  public void run() throws ConfigurationException, IOException {
    final Logger logger = Util.getLogger( this.getClass() );
    final ProgressLogger pl = new ProgressLogger( logger, logInterval );
    pl.displayFreeMemory = true;

    final int maxDocSize;

    if ( writeSizes ) {
      logger.info( "Combining sizes..." );
      final OutputBitStream sizesOutputBitStream = new OutputBitStream( outputBasename + DiskBasedIndex.SIZES_EXTENSION, bufferSize );
      maxDocSize = combineSizes( sizesOutputBitStream );
      sizesOutputBitStream.close();
      logger.info( "Sizes combined." );
    }
    else maxDocSize = -1;
   
    // To write the global count of each term
    final OutputBitStream outputGlobCounts = writeGlobCounts ? new OutputBitStream( outputBasename + DiskBasedIndex.GLOBCOUNTS_EXTENSION ) : null;
    // To write the frequency of each term
    final OutputBitStream frequencies = new OutputBitStream( outputBasename + DiskBasedIndex.FREQUENCIES_EXTENSION );
    // To write the new term list
    final PrintWriter termFile = new PrintWriter( new BufferedWriter( new OutputStreamWriter( new FileOutputStream( outputBasename + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ), bufferSize ) );
   
    // The current term
    MutableString currTerm;
   
    // Total number of pointers and occurrences
    long numPointers = 0;
   
    pl.expectedUpdates = writeGlobCounts ? numberOfOccurrences : -1;
    pl.itemsName = writeGlobCounts ? "occurrences" : "terms";
    pl.logInterval = logInterval;
    pl.start( "Combining lists..." );

    int totalFrequency, numTerms = 0, numUsedIndices, k;
    long totalGlobCount = 0;
    predictedSize = -1;
    predictedLengthNumBits = -1;
   
    // Discard first zero from offsets
    if ( p != 0 ) for( InputBitStream ibs: offsets ) ibs.readGamma();
   
    // TODO: use the front of the queue?
    while( ! termQueue.isEmpty() ) {
      numUsedIndices = 0;
      // We read a new word from the queue, copy it and write it to the term file
      currTerm = term[ k = usedIndex[ numUsedIndices++ ] = termQueue.first() ].copy();
     
      if ( DEBUG ) System.err.println( "Merging term " + currTerm );
     
      currTerm.println( termFile );
      if ( termReader[ k ].readLine( term[ k ] ) == null ) termQueue.dequeue();
      else termQueue.changed();
     
      // Then, we extract all equal words from the queue, accumulating the set of indices in usedIndex
      while( ! termQueue.isEmpty() && term[ termQueue.first() ].equals( currTerm ) ) {
        k = usedIndex[ numUsedIndices++ ] = termQueue.first();
        if ( termReader[ k ].readLine( term[ k ] ) == null ) termQueue.dequeue();
        else termQueue.changed();
      }
     
      if ( numUsedIndices > 1 ) Arrays.sort( usedIndex, 0, numUsedIndices );

      // Load index iterators
      for( int i = numUsedIndices; i-- != 0; ) indexIterator[ usedIndex[ i ] ] = indexReader[ usedIndex[ i ] ].nextIterator();

      numTerms++;

      if ( writeGlobCounts ) {
        // Compute and write the total global count. This works for all kinds of indices.
        totalGlobCount = 0;
        for( int i = 0; i < numUsedIndices; i++ ) totalGlobCount += globCounts[ usedIndex[ i ] ].readLongGamma();
        outputGlobCounts.writeLongGamma( totalGlobCount );
      }
           
      if ( p != 0 ) {
        predictedSize = 0;
        predictedLengthNumBits = 0;

        for( int i = numUsedIndices; i-- != 0; ) {

          if ( index[ usedIndex[ i ] ] instanceof BitStreamHPIndex ) {
            predictedSize += offsets[ usedIndex[ i ] ].readLongGamma();
            if ( hasPositions ) predictedLengthNumBits += posNumBits[ usedIndex[ i ] ].readLongGamma();
          }
          else {
            // Interleaved index: we must subtract the number of bits used for positions from the length of the overall inverted list
            final long t = hasPositions ? posNumBits[ usedIndex[ i ] ].readLongGamma() : 0;
            predictedSize += offsets[ usedIndex[ i ] ].readLongGamma() - t;
            predictedLengthNumBits += t;
          }
        }
      }
           
      totalFrequency = combine( numUsedIndices );
      frequencies.writeGamma( totalFrequency );
      numPointers += totalFrequency;

      /* A trick to get a correct prediction. */
      if ( writeGlobCounts ) pl.count += totalGlobCount - 1;
      pl.update();
    }
    pl.done();
   
    if ( writeGlobCounts ) outputGlobCounts.close();
    termFile.close();
    frequencies.close();

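The line pl.count += totalGlobCount - 1 above deserves a note: progress is measured in occurrences ( pl.expectedUpdates = numberOfOccurrences ), but the loop iterates once per merged term, so each iteration credits the counter with all occurrences of that term before the single update() call, which adds the last one. A minimal sketch of the same trick, with made-up batch sizes:

  import it.unimi.dsi.logging.ProgressLogger;

  public class BulkUpdateSketch {
    public static void main( String[] args ) {
      final long[] occurrencesPerTerm = { 10, 250, 3, 1000 }; // hypothetical
      long total = 0;
      for( long n : occurrencesPerTerm ) total += n;

      final ProgressLogger pl = new ProgressLogger();
      pl.itemsName = "occurrences";
      pl.expectedUpdates = total;
      pl.start( "Combining lists..." );
      for( long n : occurrencesPerTerm ) {
        // ... merge one term accounting for n occurrences ...
        pl.count += n - 1; // credit all but one occurrence
        pl.update();       // update() adds the last one and logs if due
      }
      pl.done();
    }
  }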


    localBasename = new String[ numIndices ];
    for( int i = 0; i < numIndices; i++ ) localBasename[ i ] = outputBasename + "-" + i;
  }
 
  public void runTermsOnly() throws IOException {
    final ProgressLogger pl = new ProgressLogger( LOGGER, logInterval );
   
    final PrintWriter[] localTerms = new PrintWriter[ numIndices ];
    final int numTerms[] = new int[ numIndices ];
    final FastBufferedReader terms = new FastBufferedReader( new InputStreamReader( new FileInputStream( inputBasename + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) );
   
    for( int i = 0; i < numIndices; i++ ) localTerms[ i ] = new PrintWriter( new OutputStreamWriter( new FastBufferedOutputStream( new FileOutputStream( localBasename[ i ] + DiskBasedIndex.TERMS_EXTENSION ) ), "UTF-8" ) );

    // The current term
    final MutableString currTerm = new MutableString();
   
    pl.itemsName = "terms";
    pl.logInterval = logInterval;
    pl.start( "Partitioning index terms..." );

    int termNumber = 0, k;
   
    while( terms.readLine( currTerm ) != null ) {
      k = strategy.localIndex( termNumber ); // The local index for this term
      if ( numTerms[ k ] != strategy.localNumber( termNumber ) ) throw new IllegalStateException();
      numTerms[ k ]++;
      currTerm.println( localTerms[ k ] );
      pl.update();
      termNumber++;
    }

    terms.close();
    for( int i = 0; i < numIndices; i++ ) localTerms[ i ].close();

    pl.done();
  }

    pl.done();
  }
 
  public void run() throws ConfigurationException, IOException, ClassNotFoundException {
    final ProgressLogger pl = new ProgressLogger( LOGGER, logInterval );
    final byte[] buffer = new byte[ bufferSize ];
   
    final OutputBitStream[] localIndexStream = new OutputBitStream[ numIndices ];
    final OutputBitStream[] localPositionsStream = new OutputBitStream[ numIndices ];
    final OutputBitStream[] localOffsets = new OutputBitStream[ numIndices ];
    final OutputBitStream[] localPosNumBits = new OutputBitStream[ numIndices ];
    final OutputBitStream[] localFrequencies = new OutputBitStream[ numIndices ];
    final OutputBitStream[] localGlobCounts = new OutputBitStream[ numIndices ];
    final PrintWriter[] localTerms = new PrintWriter[ numIndices ];
    final int numTerms[] = new int[ numIndices ];
    final long numberOfOccurrences[] = new long[ numIndices ];
    final long numberOfPostings[] = new long[ numIndices ];
   
    final boolean isHighPerformance = BitStreamHPIndex.class.isAssignableFrom( Class.forName( new Properties( inputBasename + DiskBasedIndex.PROPERTIES_EXTENSION ).getString( Index.PropertyKeys.INDEXCLASS ) ) );
   
    final InputBitStream globalIndex = new InputBitStream( inputBasename + DiskBasedIndex.INDEX_EXTENSION, bufferSize );
    final long globalPositionsLength = new File( inputBasename + DiskBasedIndex.POSITIONS_EXTENSION ).length();
    final InputBitStream globalPositions = isHighPerformance ? new InputBitStream( inputBasename + DiskBasedIndex.POSITIONS_EXTENSION, bufferSize ) : null;
    final FastBufferedReader terms = new FastBufferedReader( new InputStreamReader( new FileInputStream( inputBasename + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) );
    final InputBitStream offsets = new InputBitStream( inputBasename + DiskBasedIndex.OFFSETS_EXTENSION );
   
    final File posNumBitsFile = new File( inputBasename + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION );
    final InputBitStream posNumBits = posNumBitsFile.exists() ? new InputBitStream( inputBasename + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION ) : null;
    final InputBitStream frequencies = new InputBitStream( inputBasename + DiskBasedIndex.FREQUENCIES_EXTENSION );
    final InputBitStream globCounts = new InputBitStream( inputBasename + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
    offsets.readGamma();
   
    for( int i = 0; i < numIndices; i++ ) {
      localIndexStream[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.INDEX_EXTENSION, bufferSize );
      if ( isHighPerformance ) localPositionsStream[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.POSITIONS_EXTENSION, bufferSize );
      localFrequencies[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.FREQUENCIES_EXTENSION );
      localGlobCounts[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
      localTerms[ i ] = new PrintWriter( new OutputStreamWriter( new FastBufferedOutputStream( new FileOutputStream( localBasename[ i ] + DiskBasedIndex.TERMS_EXTENSION ) ), "UTF-8" ) );
      localOffsets[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.OFFSETS_EXTENSION );
      if ( posNumBits != null ) localPosNumBits[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION );
      localOffsets[ i ].writeGamma( 0 );
    }

    // The current term
    final MutableString currTerm = new MutableString();
   
    pl.expectedUpdates = ( new File( inputBasename + DiskBasedIndex.INDEX_EXTENSION ).length() + ( isHighPerformance ? new File( inputBasename + DiskBasedIndex.POSITIONS_EXTENSION ).length() : 0 ) ) * 8;
    pl.itemsName = "bits";
    pl.logInterval = logInterval;
    pl.start( "Partitioning index..." );

    int termNumber = 0, k, prevK = -1, previousHeaderLength = 0, newHeaderLength = 0;
    long length, count, positionsOffset = 0;
    int res, frequency;
   
    while( terms.readLine( currTerm ) != null ) {
      k = strategy.localIndex( termNumber ); // The local index for this term
      if ( numTerms[ k ] != strategy.localNumber( termNumber ) ) throw new IllegalStateException();
      numTerms[ k ]++;
     
      if ( isHighPerformance ) {
        final long temp = globalIndex.readBits();
        positionsOffset = globalIndex.readLongDelta();
        previousHeaderLength = (int)( globalIndex.readBits() - temp );
        if ( prevK != -1 ) {
          length = positionsOffset - globalPositions.readBits();
          pl.count += length;
          while( length > 0 ) {
            res = (int)Math.min( bufferSize * 8, length );
            globalPositions.read( buffer, res );
            localPositionsStream[ prevK ].write( buffer, res );
            length -= res;
          }
        }
        newHeaderLength = localIndexStream[ k ].writeLongDelta( localPositionsStream[ k ].writtenBits() );
      }
     
     
      frequency = frequencies.readGamma();
      localFrequencies[ k ].writeGamma( frequency );
      numberOfPostings[ k ] += frequency;

      if ( posNumBits != null ) localPosNumBits[ k ].writeGamma( posNumBits.readGamma() );
     
      count = globCounts.readLongGamma();
      numberOfOccurrences[ k ] += count;
      localGlobCounts[ k ].writeLongGamma( count );
     
      currTerm.println( localTerms[ k ] );
     
      length = offsets.readLongGamma() - previousHeaderLength;
      localOffsets[ k ].writeLongGamma( length + newHeaderLength );
      pl.count += length + previousHeaderLength - 1;
     
      while( length > 0 ) {
        res = (int)Math.min( bufferSize * 8, length );
        globalIndex.read( buffer, res );
        localIndexStream[ k ].write( buffer, res );
        length -= res;
      }
     
      pl.update();
      prevK = k;
      termNumber++;
    }

    // We pour the last piece of positions
    if ( isHighPerformance ) {
      if ( prevK != -1 ) {
        length = globalPositionsLength * 8 - globalPositions.readBits();
        System.err.println( globalPositionsLength * 8 - globalPositions.readBits() );
        while( length > 0 ) {
          res = (int)Math.min( bufferSize * 8, length );
          globalPositions.read( buffer, res );
          localPositionsStream[ prevK ].write( buffer, res );
          length -= res;
        }
      }
    }

    pl.done();

    terms.close();
    offsets.close();
    frequencies.close();
    globCounts.close();
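Progress need not be counted in records: the partitioning code above sets itemsName to "bits" and expectedUpdates to the input length in bits, then credits pl.count with each span of bits it copies. A self-contained sketch of the same accounting; the input size and buffer size are hypothetical:

  import it.unimi.dsi.logging.ProgressLogger;

  public class BitProgressSketch {
    public static void main( String[] args ) {
      final long inputBits = 1L << 30; // stand-in for new File( input ).length() * 8
      final ProgressLogger pl = new ProgressLogger();
      pl.itemsName = "bits";
      pl.expectedUpdates = inputBits;
      pl.start( "Partitioning index..." );
      long remaining = inputBits;
      while( remaining > 0 ) {
        final long copied = Math.min( 64L * 1024 * 8, remaining ); // one buffer's worth
        // ... copy `copied` bits from the global index to a local index ...
        remaining -= copied;
        pl.count += copied - 1; // credit the copied bits...
        pl.update();            // ...update() adds the last one
      }
      pl.done();
    }
  }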

    final Scan[] scan = new Scan[ numberOfIndexedFields ]; // To scan textual content
    final PayloadAccumulator[] accumulator = new PayloadAccumulator[ numberOfIndexedFields ]; // To accumulate document data

    final ProgressLogger pl = new ProgressLogger( LOGGER, logInterval, "documents" );
    if ( documentSequence instanceof DocumentCollection ) pl.expectedUpdates = ( (DocumentCollection)documentSequence ).size();
 
   
    for ( int i = 0; i < numberOfIndexedFields; i++ ) {
      final String fieldName = factory.fieldName( indexedField[ i ] );
      switch ( factory.fieldType( indexedField[ i ] ) ) {
      case TEXT:
        scan[ i ] = new Scan( basename + '-' + fieldName, fieldName, completeness, termProcessor, map != null ? IndexingType.REMAPPED
            : IndexingType.STANDARD, 0, 0, bufferSize, builder, tempDir );
        break;
      case VIRTUAL:
        scan[ i ] = new Scan( basename + '-' + fieldName, fieldName, completeness, termProcessor, IndexingType.VIRTUAL,
            virtualDocumentResolver[ i ].numberOfDocuments(), virtualGap[ i ], bufferSize, builder, tempDir );
        break;

      case DATE:
        accumulator[ i ] = new PayloadAccumulator( basename + '-' + fieldName, new DatePayload(), fieldName,
            map != null ? IndexingType.REMAPPED : IndexingType.STANDARD, documentsPerBatch, tempDir );
        break;
      case INT:
        accumulator[ i ] = new PayloadAccumulator( basename + '-' + fieldName, new IntegerPayload(), fieldName,
            map != null ? IndexingType.REMAPPED : IndexingType.STANDARD, documentsPerBatch, tempDir );
        break;
      default:

      }
    }

    if ( building ) builder.open( "@0" ); // First batch
   
    pl.displayFreeMemory = true;
    pl.start( "Indexing documents..." );

    DocumentIterator iterator = documentSequence.iterator();
    Reader reader;
    WordReader wordReader;
    ObjectList<VirtualDocumentFragment> fragments;
    Document document;

    int documentPointer = 0, documentsInBatch = 0;
    long batchStartTime = System.currentTimeMillis();
    boolean outOfMemoryError = false;

    while ( ( document = iterator.nextDocument() ) != null ) {
     
      long overallTerms = 0;
      if ( building ) builder.startDocument( document.title(), document.uri() );
      for ( int i = 0; i < numberOfIndexedFields; i++ ) {
        switch ( factory.fieldType( indexedField[ i ] ) ) {
        case TEXT:
          reader = (Reader)document.content( indexedField[ i ] );
          wordReader = document.wordReader( indexedField[ i ] );
          wordReader.setReader( reader );
          if ( building ) builder.startTextField();
          scan[ i ].processDocument( map != null ? map[ documentPointer ] : documentPointer, wordReader );
          if ( building ) builder.endTextField();
          overallTerms += scan[ i ].numTerms;
          break;
        case VIRTUAL:
          fragments = (ObjectList<VirtualDocumentFragment>)document.content( indexedField[ i ] );
          wordReader = document.wordReader( indexedField[ i ] );
          virtualDocumentResolver[ i ].context( document );
          for( VirtualDocumentFragment fragment: fragments ) {
            int virtualDocumentPointer = virtualDocumentResolver[ i ].resolve( fragment.documentSpecifier() );
            if ( virtualDocumentPointer < 0 ) continue;
            if ( map != null ) virtualDocumentPointer = map[ virtualDocumentPointer ];
            wordReader.setReader( new FastBufferedReader( fragment.text() ) );
            scan[ i ].processDocument( virtualDocumentPointer, wordReader );
          }
          if ( building ) builder.virtualField( fragments );
          overallTerms += scan[ i ].numTerms;
          break;
        default:
          Object o = document.content( indexedField[ i ] );
          accumulator[ i ].processData( map != null ? map[ documentPointer ] : documentPointer, o );
          if ( building ) builder.nonTextField( o );
          break;
        }

        if ( scan[ i ] != null && scan[ i ].outOfMemoryError ) outOfMemoryError = true;
      }
      if ( building ) builder.endDocument();
      documentPointer++;
      documentsInBatch++;
      document.close();
      pl.update();

      // We try compaction if we detect that less than PERC_AVAILABLE_MEMORY_CHECK percent of memory is available
      long percAvailableMemory = Util.percAvailableMemory();
      boolean compacted = false;
      if ( ! outOfMemoryError && percAvailableMemory < PERC_AVAILABLE_MEMORY_CHECK ) {
        LOGGER.info( "Starting compaction... (" + percAvailableMemory + "% available)" );
        compacted = true;
        Util.compactMemory();
        percAvailableMemory = Util.percAvailableMemory();
        LOGGER.info( "Compaction completed (" + percAvailableMemory + "% available)" );
      }
     
      if ( outOfMemoryError || overallTerms >= maxTerms || documentsInBatch == documentsPerBatch || ( compacted && percAvailableMemory < PERC_AVAILABLE_MEMORY_DUMP ) ) {
        if ( outOfMemoryError ) LOGGER.warn( "OutOfMemoryError during buffer reallocation: writing a batch of " + documentsInBatch + " documents" );
        else if ( overallTerms >= maxTerms ) LOGGER.warn( "Too many terms (" + overallTerms + "): writing a batch of " + documentsInBatch + " documents" );
        else if ( compacted && percAvailableMemory < PERC_AVAILABLE_MEMORY_DUMP ) LOGGER.warn( "Available memory below " + PERC_AVAILABLE_MEMORY_DUMP + "%: writing a batch of " + documentsInBatch + " documents" );

        long occurrences = 0;
        for ( int i = 0; i < numberOfIndexedFields; i++ ) {
          switch ( factory.fieldType( indexedField[ i ] ) ) {
          case TEXT:
          case VIRTUAL:
            occurrences += scan[ i ].dumpBatch();
            scan[ i ].openSizeBitStream();
            break;
          default:
            accumulator[ i ].writeData();
          }
        }
       
        if ( building ) {
          builder.close();
          builder.open( "@" + scan[ 0 ].batch );
        }

        LOGGER.info( "Last set of batches indexed at " + Util.format( ( 1000. * occurrences ) / ( System.currentTimeMillis() - batchStartTime ) ) + " occurrences/s" );
        batchStartTime = System.currentTimeMillis();
        documentsInBatch = 0;
        outOfMemoryError = false;
      }
    }

    iterator.close();
    if ( builder != null ) builder.close();

    for ( int i = 0; i < numberOfIndexedFields; i++ ) {
      switch ( factory.fieldType( indexedField[ i ] ) ) {
      case TEXT:
      case VIRTUAL:
        scan[ i ].close();
        break;
      default:
        accumulator[ i ].close();
        break;
      }

    }

    documentSequence.close();
   
    pl.done();

    if ( building ) {
      final String name = new File( builder.basename() ).getName();
      final String[] collectionName = new String[ scan[ 0 ].batch ];
      for ( int i = scan[ 0 ].batch; i-- != 0; ) collectionName[ i ] = name + "@" + i + DocumentCollection.DEFAULT_EXTENSION;
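Note how the indexing loop above sizes its progress report opportunistically: it sets expectedUpdates only when documentSequence is a DocumentCollection, whose size() is known; otherwise ProgressLogger still reports counts and throughput, just without completion estimates. A sketch of the same pattern over a plain Iterable; the document source is a hypothetical stand-in:

  import it.unimi.dsi.logging.ProgressLogger;
  import java.util.Arrays;
  import java.util.Collection;

  public class OptionalSizeSketch {
    public static void main( String[] args ) {
      final Iterable<String> documents = Arrays.asList( "a", "b", "c" ); // hypothetical
      final ProgressLogger pl = new ProgressLogger();
      pl.itemsName = "documents";
      pl.displayFreeMemory = true;
      // Set expectedUpdates only when the size is knowable.
      if ( documents instanceof Collection ) pl.expectedUpdates = ( (Collection<?>)documents ).size();
      pl.start( "Indexing documents..." );
      for( String document : documents ) pl.update(); // one update per document
      pl.done();
    }
  }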

    int words[] = new int[ 1024 ];
    final FastBufferedReader reader = new FastBufferedReader( new InputStreamReader( System.in, "UTF-8" ) );
   
    int lineNumber = 0;
    int numberOfPartialQueries = queries;
    ProgressLogger pl = new ProgressLogger( LOGGER );
    pl.itemsName = "Klines";
    pl.expectedUpdates = maxDoc / 1000;
    pl.start( "Generating queries..." );
    MutableString line = new MutableString();
    while( reader.readLine( line ) != null && numberOfPartialQueries > 0 ) {
      if ( used[ lineNumber ] ) {
        for ( q = 0; q < queries; q++ )
          if ( coveredForQuery[ q ] < docperquery && docForQuery[ q ][ coveredForQuery[ q ] ] == lineNumber ) {
            split = line.toString().split( " " );
            int nw = split.length;
            words = IntArrays.ensureCapacity( words, nw + 1 );
            for ( i = 0; i < nw; i++ ) words[ i ] = i;
            for ( i = 0; i < Math.min( wordsperdoc, nw ); i++ ) {
              j = i + (int)( ( nw - i ) * Math.random() );
              t = words[ i ]; words[ i ] = words[ j ]; words[ j ] = t;
              query[ q ][ coveredForQuery[ q ] ][ i ] = split[ words[ i ] ];
            }
            coveredForQuery[ q ]++;
            if ( coveredForQuery[ q ] == docperquery ) numberOfPartialQueries--;
          }
      }
      lineNumber++;
      if ( lineNumber % 1000 == 0 ) pl.update();
    }
    pl.done();

    MutableString p[] = new MutableString[ Math.max( queries, wordsperdoc ) ], s = new MutableString();
    for( i = 0; i < p.length; i++ ) p[ i ] = new MutableString();

    for ( q = 0; q < queries; q++ ) {
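When single items are too fine-grained to log, the query generator above counts thousands of lines instead: itemsName is "Klines", expectedUpdates is divided by 1000, and update() is called once every 1000 lines. A minimal sketch; the line count is made up:

  import it.unimi.dsi.logging.ProgressLogger;

  public class BatchedUpdateSketch {
    public static void main( String[] args ) {
      final int totalLines = 5000000; // hypothetical
      final ProgressLogger pl = new ProgressLogger();
      pl.itemsName = "Klines";
      pl.expectedUpdates = totalLines / 1000;
      pl.start( "Generating queries..." );
      for( int lineNumber = 1; lineNumber <= totalLines; lineNumber++ ) {
        // ... process one line ...
        if ( lineNumber % 1000 == 0 ) pl.update(); // one update per 1000 lines
      }
      pl.done();
    }
  }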

      for ( int i = 0; i < numIndices; i++ ) localSizes[ i ].close();
    }
  }
 
  public void run() throws Exception {
    final ProgressLogger pl = new ProgressLogger( LOGGER, logInterval );
    final IntList sizeList = globalIndex.sizes;
    partitionSizes();
   
    final int[] position = new int[ globalIndex.maxCount ];
    final int[] localFrequency = new int[ numIndices ];
    final int[] usedIndex = new int[ numIndices ];
    final InputBitStream[] direct = new InputBitStream[ numIndices ];
    final InputBitStream[] indirect = new InputBitStream[ numIndices ];
    final BloomFilter[] bloomFilter = bloomFilterPrecision != 0 ? new BloomFilter[ numIndices ] : null;
    final File[] tempFile = new File[ numIndices ];
    final CachingOutputBitStream[] temp = new CachingOutputBitStream[ numIndices ];
    IndexIterator indexIterator;
   
    for ( int i = 0; i < numIndices; i++ ) {
      tempFile[ i ] = new File( localBasename[ i ] + ".temp" );
      temp[ i ] = new CachingOutputBitStream( tempFile[ i ], bufferSize );
      direct[ i ] = new InputBitStream( temp[ i ].buffer() );
      indirect[ i ] = new InputBitStream( tempFile[ i ] );
      if ( bloomFilterPrecision != 0 ) bloomFilter[ i ] = new BloomFilter( globalIndex.numberOfTerms, bloomFilterPrecision );
    }
    int usedIndices;
    MutableString currentTerm = new MutableString();
    Payload payload = null;
    int frequency, globalPointer, localIndex, localPointer, count = -1;

    pl.expectedUpdates = globalIndex.numberOfPostings;
    pl.itemsName = "postings";
    pl.logInterval = logInterval;
    pl.start( "Partitioning index..." );

    for ( int t = 0; t < globalIndex.numberOfTerms; t++ ) {
      terms.readLine( currentTerm );
      indexIterator = indexReader.nextIterator();
      usedIndices = 0;
      frequency = indexIterator.frequency();
     
      for ( int j = 0; j < frequency; j++ ) {
        globalPointer = indexIterator.nextDocument();               
        localIndex = strategy.localIndex( globalPointer );

        if ( localFrequency[ localIndex ] == 0 ) {
          // First time we see a document for this index.
          currentTerm.println( localTerms[ localIndex ] );
          numTerms[ localIndex ]++;
          usedIndex[ usedIndices++ ] = localIndex;
          if ( bloomFilterPrecision != 0 ) bloomFilter[ localIndex ].add( currentTerm );
        }
       
        /* We store the posting data temporarily; note that we save the global pointer, as we
         * will have to access the size list. */
       
        localFrequency[ localIndex ]++;
        numPostings[ localIndex ]++;
        temp[ localIndex ].writeGamma( globalPointer );

        if ( globalIndex.hasPayloads ) payload = indexIterator.payload();
        if ( havePayloads ) payload.write( temp[ localIndex ] );
       
        if ( haveCounts ) {
          count = indexIterator.count();
          temp[ localIndex ].writeGamma( count );
          globCount[ localIndex ] += count;       
          if ( maxDocPos[ localIndex ] < count ) maxDocPos[ localIndex ] = count;        
          if ( havePositions ) {
            final int[] pos = indexIterator.positionArray();
            // TODO: compress this stuff
            for( int p = 0; p < count; p++ ) temp[ localIndex ].writeGamma( pos[ p ] );
          }
        }
      }
     
      // We now run through the indices used by this term and copy from the temporary buffer.

      OutputBitStream obs;
     
      for( int k = 0; k < usedIndices; k++ ) {
        final int i = usedIndex[ k ];

        localFrequencies[ i ].writeGamma( localFrequency[ i ] );
        if ( haveCounts ) numOccurrences[ i ] += globCount[ i ];
        if ( localGlobCounts[ i ] != null ) localGlobCounts[ i ].writeLongGamma( globCount[ i ] );
        globCount[ i ] = 0;
       
        InputBitStream ibs;
        indexWriter[ i ].newInvertedList();

        temp[ i ].align();
        if ( temp[ i ].buffer() != null ) ibs = direct[ i ];
        else {
          // We cannot read directly from the internal buffer.
          ibs = indirect[ i ];
          ibs.flush();
          temp[ i ].flush();
        }

        ibs.position( 0 );
         
        indexWriter[ i ].writeFrequency( localFrequency[ i ] );
        for( int j = 0; j < localFrequency[ i ]; j++ ) {
          obs = indexWriter[ i ].newDocumentRecord();
          globalPointer = ibs.readGamma();
          localPointer = strategy.localPointer( globalPointer );
          indexWriter[ i ].writeDocumentPointer( obs, localPointer );
          if ( havePayloads ) {
            payload.read( ibs );
            indexWriter[ i ].writePayload( obs, payload );
          }
          if ( haveCounts ) indexWriter[ i ].writePositionCount( obs, count = ibs.readGamma() );
          if ( havePositions ) {
            for( int p = 0; p < count; p++ ) position[ p ] = ibs.readGamma();
            indexWriter[ i ].writeDocumentPositions( obs, position, 0, count, sizeList != null ? sizeList.getInt( globalPointer ) : -1 );
          }
         
        }
        temp[ i ].position( 0 );
        temp[ i ].writtenBits( 0 );
        localFrequency[ i ] = 0;
      }
     
      usedIndices = 0;
      pl.count += frequency - 1;
      pl.update();
    }

    pl.done();

    Properties globalProperties = new Properties();
    globalProperties.setProperty( Index.PropertyKeys.FIELD, inputProperties.getProperty( Index.PropertyKeys.FIELD ) );
    globalProperties.setProperty( Index.PropertyKeys.TERMPROCESSOR, inputProperties.getProperty( Index.PropertyKeys.TERMPROCESSOR ) );
   

    if ( jsapResult.userSpecified( "uris" ) ) uriStream = new FastBufferedOutputStream( new FileOutputStream( jsapResult.getString( "uris" ) ) );
    if ( jsapResult.userSpecified( "titles" ) ) titleStream = new FastBufferedOutputStream( new FileOutputStream( jsapResult.getString( "titles" ) ) );
   
    MutableString s = new MutableString();

    ProgressLogger progressLogger = new ProgressLogger( LOGGER, jsapResult.getLong( "logInterval" ), "documents" );
    if ( documentSequence instanceof DocumentCollection ) progressLogger.expectedUpdates = ((DocumentCollection)documentSequence).size();
    progressLogger.start( "Scanning..." );
   
    while( ( document = documentIterator.nextDocument() ) != null ) {
      if ( uriStream != null ) {
        s.replace( document.uri() );
        s.replace( LINE_TERMINATORS, SPACES );
        s.writeUTF8( uriStream );
        uriStream.write( '\n' );
      }
      if ( titleStream != null ) {
        s.replace( document.title() );
        s.replace( LINE_TERMINATORS, SPACES );
        s.writeUTF8( titleStream );
        titleStream.write( '\n' );
      }
      progressLogger.lightUpdate();
    }
   
    progressLogger.done();
    if ( uriStream != null ) uriStream.close();
    if ( titleStream != null ) titleStream.close();
  }
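The scan above uses lightUpdate() rather than update(): it is a cheaper variant intended for very tight loops, which only occasionally checks whether a log line is due. A minimal sketch; the iteration count is illustrative:

  import it.unimi.dsi.logging.ProgressLogger;

  public class LightUpdateSketch {
    public static void main( String[] args ) {
      final ProgressLogger pl = new ProgressLogger();
      pl.itemsName = "documents";
      pl.start( "Scanning..." );
      for( int i = 0; i < 10000000; i++ ) {
        // ... emit one URI/title line ...
        pl.lightUpdate(); // lower overhead than update() in tight loops
      }
      pl.done();
    }
  }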

    if ( uniqueURIs ) filter = new BloomFilter( jsapResult.getInt( "uniqueUris" ) );
   
    final Collection<? extends CharSequence> collection;
    if ( termFile == null ) {
      ArrayList<MutableString> termList = new ArrayList<MutableString>();
      final ProgressLogger pl = new ProgressLogger();
      pl.itemsName = "URIs";
      final LineIterator termIterator = new LineIterator( new FastBufferedReader( new InputStreamReader( System.in, "UTF-8" ), bufferSize ), pl );
     
      pl.start( "Reading URIs..." );
      MutableString uri;
      while( termIterator.hasNext() ) {
        uri = termIterator.next();
        if ( uniqueURIs ) makeUnique( filter, uri );
        termList.add( uri.copy() );
      }
      pl.done();
     
      collection = termList;
    }
    else {
      if ( uniqueURIs ) {
        // Create temporary file with unique URIs
        final ProgressLogger pl = new ProgressLogger();
        pl.itemsName = "URIs";
        pl.start( "Copying URIs..." );
        final LineIterator termIterator = new LineIterator( new FastBufferedReader( new InputStreamReader( new FileInputStream( termFile ) ), bufferSize ), pl );
        File temp = File.createTempFile( URLMPHVirtualDocumentResolver.class.getName(), ".uniqueuris" );
        temp.deleteOnExit();
        termFile = temp.toString();
        final FastBufferedOutputStream outputStream = new FastBufferedOutputStream( new FileOutputStream( termFile ), bufferSize );
        MutableString uri;
        while( termIterator.hasNext() ) {
          uri = termIterator.next();
          makeUnique( filter, uri );
          uri.writeUTF8( outputStream );
          outputStream.write( '\n' );
        }
        pl.done();
        outputStream.close();
      }
      collection = new FileLinesCollection( termFile, "UTF-8" );
    }
    LOGGER.debug( "Building function..." );
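Cooperating classes can drive the logger themselves: above, the ProgressLogger is handed to it.unimi.dsi.io.LineIterator, which updates it once per line read, so the caller only brackets the iteration with start()/done(). A self-contained sketch reading lines from standard input:

  import it.unimi.dsi.io.FastBufferedReader;
  import it.unimi.dsi.io.LineIterator;
  import it.unimi.dsi.lang.MutableString;
  import it.unimi.dsi.logging.ProgressLogger;

  import java.io.InputStreamReader;

  public class LineIteratorSketch {
    public static void main( String[] args ) throws Exception {
      final ProgressLogger pl = new ProgressLogger();
      pl.itemsName = "lines";
      final LineIterator lines = new LineIterator(
        new FastBufferedReader( new InputStreamReader( System.in, "UTF-8" ) ), pl );
      pl.start( "Reading lines..." );
      while( lines.hasNext() ) {
        final MutableString line = lines.next();
        // ... use line ...
      }
      pl.done();
    }
  }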

    final boolean isVirtual = jsapResult.getBoolean( "virtual" );

    int i, t = 0;

    final ProgressLogger pl = new ProgressLogger( LOGGER, jsapResult.getLong( "logInterval" ), "ints" );
    final Index[] index = stem ? new Index[ indexedField.length ] : new Index[ 1 ];
    final int numberOfTerms[] = new int[ indexedField.length ];
    final ObjectArrayList<MutableString>[] terms = new ObjectArrayList[ indexedField.length ];
    final IndexReader[] indexReader = new IndexReader[ index.length ];
    final InputBitStream[] frequencies = new InputBitStream[ index.length ];
    final int[][] count = new int[ index.length ][];
    final int[] permutation = permutationFile != null ? BinIO.loadInts( permutationFile ) : null;
    final int[][] occ = new int[ index.length ][];
    final int[][] wordInPos = new int[ index.length ][];
    final Int2IntMap[] termsInDoc = new Int2IntOpenHashMap[ index.length ];
    int totalTerms = 0;
   
    boolean allBitStreamIndices = true;
   
    for( i = 0; i < index.length; i++ ) {
      final String basenameField = basename + (stem ? "-" + factory.fieldName( indexedField[ i ] ) : "" );
      index[ i ] = Index.getInstance( basenameField );
      if ( ! ( index[ i ] instanceof BitStreamIndex ) ) allBitStreamIndices = false;
     
      if ( termLists ) {
        terms[ i ] = new ObjectArrayList<MutableString>( new FileLinesCollection( basenameField + DiskBasedIndex.TERMS_EXTENSION, "UTF-8" ).allLines() );
        numberOfTerms[ i ] = terms[ i ].size();
      }
      else numberOfTerms[ i ] = index[ i ].numberOfTerms;
      totalTerms += numberOfTerms[ i ];
     
      // This will be matched with the number of occurrences per document
      count[ i ] = new int[ index[ i ].numberOfDocuments ];

      occ[ i ] = index[ i ].maxCount > 0 ? new int[ index[ i ].maxCount ] : IntArrays.EMPTY_ARRAY;
      wordInPos[ i ] = new int[ Math.max( 0, index[ i ].properties.getInt( Index.PropertyKeys.MAXDOCSIZE ) ) ];
      indexReader[ i ] = index[ i ].getReader();
     
      if ( new File( basenameField + DiskBasedIndex.FREQUENCIES_EXTENSION ).exists() ) frequencies[ i ] = new InputBitStream( basenameField + DiskBasedIndex.FREQUENCIES_EXTENSION );
      termsInDoc[ i ] = new Int2IntOpenHashMap();
    }


    int currDoc = 0;
    // Term position in the current document.
    int pos = 0, f = 0, p;

    pl.itemsName = "lists";
    pl.expectedUpdates = totalTerms;
   
    int indexFrequency = -1;
   
    // Sequential scan
    if ( !jsapResult.getBoolean( "noSeq" ) ) {
      try {
        for ( i = 0; i < index.length; i++ ) {
          int numberOfPostings = 0;
          pl.expectedUpdates = numberOfTerms[ i ];
          pl.start( "Verifying sequentially index " + index[ i ] + "..." );

          if ( allBitStreamIndices ) {
            for ( t = 0; t < numberOfTerms[ i ]; t++ ) {
              pl.update();
              IndexIterator indexIterator = indexReader[ i ].nextIterator();
              indexFrequency = indexIterator.frequency();
              numberOfPostings += indexFrequency;
              if ( frequencies[ i ] != null && indexFrequency != ( f = frequencies[ i ].readGamma() ) ) {
                System.err.println( "Error in frequency for term " + t + ": expected " + f + " documents, found " + indexFrequency );
                return;
              }

              while ( indexFrequency-- != 0 ) {
                p = indexIterator.nextDocument();
                if (index[i].hasCounts) count[i][p] += indexIterator.count();
                if (index[i].hasPositions) indexIterator.positionArray(); // Just to force reading in high-performance indices
              }
              if ( indexIterator.nextDocument() != -1 ) throw new AssertionError( "nextDocument() is not -1 after exhaustive iteration" );
            }
           
            // Check document sizes
            if ( ! isVirtual && ( (BitStreamIndex) index[ i ] ).sizes != null && index[ i ].hasCounts )
              for ( p = 0; p < index[ i ].numberOfDocuments; p++ )
                if ( index[ i ].sizes.getInt( p ) != count[ i ][ p ] )
                  System.err.println( "Document " + p + " has size " + ( (BitStreamIndex) index[ i ] ).sizes.getInt( p ) + " but " + count[ i ][ p ] + " occurrences have been stored." );
           
          }
          else { // Non-bitstream indices
            for (t = 0; t < numberOfTerms[ i ]; t++) {
              pl.update();
              IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
              indexFrequency = indexIterator.frequency();
              numberOfPostings += indexFrequency;
              if (frequencies[i] != null && indexFrequency != (f = frequencies[i].readGamma())) {
                System.err.println("Error in frequency for term " + t
                    + ": expected " + f + " documents, found "
                    + indexFrequency);
                return;
              }
             
              int prevp = -1;
              while (indexFrequency-- != 0) {
                p = indexIterator.nextDocument();
                if ( prevp >= p ) throw new AssertionError( "previous pointer: " + prevp + "; current pointer: " + p );
                prevp = p;
                if (index[i].hasCounts) count[i][p] += indexIterator.count();
              }
            }
          }
          pl.done();
         
          if ( ! isVirtual && numberOfPostings != index[ i ].numberOfPostings ) System.err.println( "Index declares " + index[ i ].numberOfPostings + " postings, but we found " + numberOfPostings );
          long numberOfOccurrences = 0;
          if ( index[ i ].hasCounts ) {
            for ( p = 0; p < index[ i ].numberOfDocuments; p++ ) numberOfOccurrences += count[ i ][ p ];
            if ( numberOfOccurrences != index[ i ].numberOfOccurrences ) System.err.println( "Index declares " + index[ i ].numberOfOccurrences + " occurrences, but we found " + numberOfOccurrences );
          }
        }
      } catch ( Exception e ) {
        System.err.println( "Exception while scanning sequentially term " + t + " of index " + index[ i ] );
        System.err.println( "Term frequency was " + f + " and position " + ( f - indexFrequency - 1 ) );
        throw e;
      }
    }
 
    IntArrayList l = new IntArrayList();
    ObjectArrayList<int[]> positions = new ObjectArrayList<int[]>();
   
    if ( ! jsapResult.getBoolean( "noSkip" ) ) {
      int start = 0, end = 0, result;
      try {
        for (i = 0; i < index.length; i++) {
         
          pl.expectedUpdates = numberOfTerms[ i ];
          pl.start("Verifying all skips in " + index[i] + "...");

          for (t = 0; t < numberOfTerms[ i ]; t++) {
            l.clear();
            positions.clear();
            IndexIterator documents = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
            int d;
            while( ( d = documents.nextDocument() ) != -1 ) {
              l.add( d );
              if ( index[ i ].hasPositions ) positions.add( ArrayUtils.subarray( documents.positionArray(), 0, documents.count() ) );
            }
           
            for( start = 0; start < l.size(); start++ ) {
              for( end = start + 1; end < l.size(); end++ ) {
                IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
               
                result = indexIterator.skipTo( l.getInt( start ) );
                if ( indexIterator.document() != l.getInt( start ) || result != l.getInt( start ) ) throw new AssertionError( "Trying to skip to document " + l.getInt( start ) + " (term " + t + ") moved to " + indexIterator.document() + "(skipTo() returned " + result + ")" );
                result = indexIterator.skipTo( l.getInt( end ) );
                if ( indexIterator.document() != l.getInt( end ) || result != l.getInt( end ) ) throw new AssertionError( "Trying to skip to document " + l.getInt( end ) + " (term " + t + ") after a skip to " + start + " moved to " + indexIterator.document() + "(skipTo() returned " + result + ")" );
               
                if ( index[ i ].hasPositions ) {
                  // This catches wrong state reconstruction after skips.
                  indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
                  indexIterator.skipTo( l.getInt( start ) );
                  if ( indexIterator.document() != l.getInt( start ) ) throw new AssertionError(indexIterator.document() + " != " + l.getInt( start ) );
                  if ( indexIterator.count() != positions.get( start ).length ) throw new AssertionError(indexIterator.count() + " != " + positions.get( start ).length );
                  if ( ! Arrays.equals( positions.get( start ), ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) )
                     ) throw new AssertionError(Arrays.toString( positions.get( start ) ) + "!=" + Arrays.toString( ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) ) );
                  indexIterator.skipTo( l.getInt( end ) );
                  if ( indexIterator.document() != l.getInt( end )  ) throw new AssertionError(indexIterator.document() + " != " + l.getInt( end ) );
                  if ( indexIterator.count() != positions.get( end ).length ) throw new AssertionError(indexIterator.count() + " != " + positions.get( end ).length );
                  if ( ! Arrays.equals( positions.get( end ), ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) )
                     ) throw new AssertionError(Arrays.toString( positions.get( end ) ) + "!=" + Arrays.toString( ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) ) );
                }
               
              }
             
              IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
             
              result = indexIterator.skipTo( l.getInt( start ) );
              if ( indexIterator.document() != l.getInt( start ) || result != l.getInt( start ) ) throw new AssertionError("Trying to skip to document " + l.getInt( start ) + " (term " + t + ") moved to " + indexIterator.document() + "(skipTo() returned " + result + ")" );
              result = indexIterator.skipTo( Integer.MAX_VALUE );
              if ( indexIterator.hasNext() || result != Integer.MAX_VALUE ) throw new AssertionError("Trying to skip beyond end of list (term " + t + ") after a skip to " + start + " returned " + result + " (hasNext()=" + indexIterator.hasNext() + ")" );
             
             
            }
            pl.update();
          }
          pl.done();
        }
      }
      catch( Throwable e  ) {
        System.err.println( "Exception during all-skip test (index=" + index[ i ] + ", term=" + t + ", start=" + start + ", end=" + end + ")" );
        throw e;
      }
     }
   

    if ( ! jsapResult.getBoolean( "noComp" ) ) {
      IndexReader additionalReader;
      IntLinkedOpenHashSet s0 = new IntLinkedOpenHashSet();
      IntOpenHashSet s1 = new IntOpenHashSet();
      IntAVLTreeSet s2 = new IntAVLTreeSet();
      IntIterator it;
      IndexIterator indexIterator, additionalIterator;
      it.unimi.dsi.mg4j.search.DocumentIterator documentIterator;
      int u = 0;
     
      try {
        for (i = 0; i < index.length; i++) {
          pl.expectedUpdates = numberOfTerms[ i ];
          pl.start("Verifying composite iterators in " + index[i] + "...");
          additionalReader = index[ i ].getReader();
         
          for (t = 0; t < numberOfTerms[ i ]; t++) {
            for (u = 0; u < numberOfTerms[ i ]; u++) {
              s0.clear();
              s1.clear();
              // TODO: in case we have positions, we should check them, too
              IntIterators.pour( termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t ), s0 );
              IntIterators.pour( termLists ? indexReader[ i ].documents( terms[ i ].get( u ) ) : indexReader[ i ].documents( u ), s1 );
              s0.retainAll( s1 );
              indexIterator =  termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
              additionalIterator = termLists ? additionalReader.documents( terms[ i ].get( u ) ) : additionalReader.documents( u );
              it = s0.iterator();
              documentIterator = AndDocumentIterator.getInstance( indexIterator, additionalIterator );
              for( int j = s0.size(); j-- != 0; ) if ( it.nextInt() != documentIterator.nextDocument() ) throw new AssertionError();
              if ( documentIterator.hasNext() ) throw new AssertionError();

              s2.clear();
              IntIterators.pour( termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t ), s2 );
              IntIterators.pour( termLists ? indexReader[ i ].documents( terms[ i ].get( u ) ) : indexReader[ i ].documents( u ), s2 );

              indexIterator =  termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
              additionalIterator = termLists ? additionalReader.documents( terms[ i ].get( u ) ) : additionalReader.documents( u );

              it = s2.iterator();
              documentIterator = OrDocumentIterator.getInstance( indexIterator, additionalIterator );
              for( int j = s2.size(); j-- != 0; ) if ( it.nextInt() != documentIterator.nextDocument() ) throw new AssertionError();
              if ( documentIterator.hasNext() ) throw new AssertionError();
            }
            pl.update();
          }
          pl.done();
          additionalReader.close();
        }
      }
      catch( Throwable e  ) {
        System.err.println( "Exception during composite iterator test (index=" + index[ i ] + ", first term=" + t + ", second term =" + u + ")" );
        throw e;
      }
    }
   
    if ( ! isVirtual && jsapResult.getBoolean( "random" ) ) {
     
      // Random access scan
      pl.expectedUpdates = index[ 0 ].numberOfDocuments;
      pl.itemsName = "documents";
      pl.start( "Verifying random access..." );

      if ( allBitStreamIndices ) {
        it.unimi.dsi.mg4j.document.DocumentIterator documentIterator = documentSequence.iterator();
        Document document;
        Reader reader;
        WordReader wordReader;
       
        final MutableString word = new MutableString(), nonWord = new MutableString();
       
        int docCounter = 0;
       
        while( ( document = documentIterator.nextDocument() ) != null ) {
          currDoc = permutation != null ? permutation[ docCounter ] : docCounter;

          for( i = 0; i < index.length; i++ ) {
            Object content = document.content( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
            if ( index[ i ].hasPayloads ) {
              // TODO: write tests for the other case
              if ( allBitStreamIndices ) {
                IndexIterator indexIterator = indexReader[ i ].documents( 0 );
                int pointer = indexIterator.skipTo( currDoc );
                if ( pointer == currDoc ) {
                  Payload payload = indexIterator.payload();
                  if ( ! payload.get().equals( content ) ) LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + payload );
                }
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
              else {
                IndexIterator indexIterator = indexReader[ i ].documents( "#" );
                if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                  if ( ! indexIterator.payload().get().equals( content ) )
                    LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + indexIterator.payload().get() );
                }
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
            }
            else {
              // text index
              pos = 0;
              termsInDoc[ i ].clear();
              reader = (Reader)content;
              wordReader = document.wordReader( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
              wordReader.setReader( reader );
              while( wordReader.next( word, nonWord ) ) {
                if ( word.length() == 0 || index[ i ].termProcessor != null && ! index[ i ].termProcessor.processTerm( word ) ) continue;
                if ( ( t = (int)( (BitStreamIndex)index[ i ] ).termMap.getLong( word ) ) == -1 ) LOGGER.error( index[ i ] + ": Could not find term " + word + " in term index" );
                else {
                  if ( index[ i ].hasCounts ) termsInDoc[ i ].put( t, termsInDoc[ i ].get( t ) + 1 );
                  if ( index[ i ].hasPositions ) wordInPos[ i ][ pos++ ] = t;
                }
              }

              if ( allBitStreamIndices ) {
                for( IntIterator x = termsInDoc[ i ].keySet().iterator(); x.hasNext(); ) {
                  t = x.nextInt();

                  IndexIterator indexIterator = indexReader[ i ].documents( t );

                  int pointer = indexIterator.skipTo( currDoc );
                  if ( pointer == currDoc ) {
                    if ( index[ i ].hasCounts ) {
                      int c = indexIterator.count();
                      if ( termsInDoc[ i ].get( t ) !=  c )
                        LOGGER.error( index[ i ] + ": The count for term " + t + " in document " + currDoc + " is " + c + " instead of " + termsInDoc[ i ].get( t ) );
                      else {
                        if ( index[ i ].hasPositions ) {
                          indexIterator.positions( occ[ i ] );

                          for( int j = 0; j < c; j++ )
                            if ( wordInPos[ i ][ occ[ i ][ j ] ] != t )
                              LOGGER.error( index[ i ] + ": The occurrence of index " + i + " of term " + t + " (position " + occ[ i ][ j ] + ") in document " + currDoc + " is occupied instead by term " + wordInPos[ i ][ occ[ i ][ j ] ] );
                        }
                      }
                    }
                  }
                  else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t + "(skipTo returned " + pointer + ")" );
                }
              }
              else {
                for( IntIterator x = termsInDoc[ i ].keySet().iterator(); x.hasNext(); ) {
                  t = x.nextInt();
                  IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );

                  if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                    if ( index[ i ].hasCounts ) {
                      int c = indexIterator.count();
                      if ( termsInDoc[ i ].get( t ) !=  c )
                        LOGGER.error( index[ i ] + ": The count for term " + t + " in document " + currDoc + " is " + c + " instead of " + termsInDoc[ i ].get( t ) );
                      else {
                        if ( index[ i ].hasPositions ) {
                          indexIterator.positions( occ[ i ] );

                          for( int j = 0; j < c; j++ )
                            if ( wordInPos[ i ][ occ[ i ][ j ] ] != t )
                              LOGGER.error( index[ i ] + ": The occurrence of index " + i + " of term " + t + " (position " + occ[ i ][ j ] + ") in document " + currDoc + " is occupied instead by term " + wordInPos[ i ][ occ[ i ][ j ] ] );
                        }
                      }
                    }
                  }
                  else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
                }
              }
            }
          }
          docCounter++;
          document.close();
          pl.update();
        }
      }
      else {
        LOGGER.warn( "Random access tests require very slow single-term scanning as not all indices are disk based" );

        it.unimi.dsi.mg4j.document.DocumentIterator documentIterator = documentSequence.iterator();
        Document document;
        Reader reader;
        WordReader wordReader;
       
        final MutableString word = new MutableString(), nonWord = new MutableString();
       
        int docCounter = 0;
       
        while( ( document = documentIterator.nextDocument() ) != null ) {
          currDoc = permutation != null ? permutation[ docCounter ] : docCounter;

          for( i = 0; i < index.length; i++ ) {
            Object content = document.content( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
            if ( index[ i ].hasPayloads ) {
              if ( allBitStreamIndices ) {
                IndexIterator indexIterator = indexReader[ i ].documents( 0 );
                int pointer = indexIterator.skipTo( currDoc );
                if ( pointer == currDoc ) {
                  Payload payload = indexIterator.payload();
                  if ( ! payload.get().equals( content ) ) LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + payload );
                }
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
              else {
                IndexIterator indexIterator = indexReader[ i ].documents( "#" );
                if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                  if ( ! indexIterator.payload().get().equals( content ) )
                    LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + indexIterator.payload().get() );
                }
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
            }
            else {
              pos = 0;
              reader = (Reader)content;
              wordReader = document.wordReader( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
              wordReader.setReader( reader );
              while( wordReader.next( word, nonWord ) ) {
                if ( word.length() == 0 || index[ i ].termProcessor != null && ! index[ i ].termProcessor.processTerm( word ) ) continue;
                IndexIterator indexIterator = indexReader[ i ].documents( word );
                if ( currDoc != indexIterator.skipTo( currDoc ) )
                  LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + word );
                else if ( index[ i ].hasPositions ) {
                  indexIterator.positions( occ[ i ] );
                  if ( IntArrayList.wrap( occ[ i ], indexIterator.count() ).indexOf( pos ) == -1 )
                    LOGGER.error( index[ i ] + ": Position " + pos + " does not appear in the position list of term " + word + " in document " + currDoc );
                }
                pos++;
              }
            }
          }
          document.close();
          pl.update();
          docCounter++;
        }
      }

      pl.done();
    }
   
    for( IndexReader ir : indexReader ) ir.close();
  }

    this.factory = factory;
    this.bufferSize = bufferSize;
    this.descriptors = new ObjectArrayList<TRECDocumentDescriptor>();
    this.useGzip = useGzip;

    final ProgressLogger progressLogger = new ProgressLogger( LOGGER );
    progressLogger.expectedUpdates = file.length;
    progressLogger.itemsName = "files";

    progressLogger.start( "Parsing " + ( useGzip ? "GZip" : "plain" ) + " files" );

    for ( int i = 0; i < file.length; i++ ) {
      parseContent( i, openFileStream( file[ i ] ) );
      progressLogger.update();
    }

    progressLogger.done();
  }
