Examples of it.unimi.dsi.mg4j.document.DocumentIterator

it.unimi.dsi.mg4j.document.DocumentIterator
An iterator over documents.
This interface provides a {@link #nextDocument()} method returning the next document, or null if no more documents are available. Usually you would need to {@link Document#close()} each document when you are finished with it, but in the present case it is guaranteed that each call to {@link DocumentIterator#nextDocument()} will close the previously returned document.
An additional {@link #close()} method releases all resources used by the iterator. Implementations are invited to be {@link it.unimi.dsi.io.SafelyCloseable safely closeable}.

    if ( building ) builder.open( "@0" ); // First batch
    
    pl.displayFreeMemory = true;
    pl.start( "Indexing documents..." );


    DocumentIterator iterator = documentSequence.iterator();
    Reader reader;
    WordReader wordReader;
    ObjectList<VirtualDocumentFragment> fragments;
    Document document;


    int documentPointer = 0, documentsInBatch = 0;
    long batchStartTime = System.currentTimeMillis();
    boolean outOfMemoryError = false;


    while ( ( document = iterator.nextDocument() ) != null ) {
      
      long overallTerms = 0;
      if ( building ) builder.startDocument( document.title(), document.uri() );
      for ( int i = 0; i < numberOfIndexedFields; i++ ) {
        switch ( factory.fieldType( indexedField[ i ] ) ) {
        case TEXT:
          reader = (Reader)document.content( indexedField[ i ] );
          wordReader = document.wordReader( indexedField[ i ] );
          wordReader.setReader( reader );
          if ( building ) builder.startTextField();
          scan[ i ].processDocument( map != null ? map[ documentPointer ] : documentPointer, wordReader );
          if ( building ) builder.endTextField();
          overallTerms += scan[ i ].numTerms;
          break;
        case VIRTUAL:
          fragments = (ObjectList<VirtualDocumentFragment>)document.content( indexedField[ i ] );
          wordReader = document.wordReader( indexedField[ i ] );
          virtualDocumentResolver[ i ].context( document );
          for( VirtualDocumentFragment fragment: fragments ) {
            int virtualDocumentPointer = virtualDocumentResolver[ i ].resolve( fragment.documentSpecifier() );
            if ( virtualDocumentPointer < 0 ) continue;
            if ( map != null ) virtualDocumentPointer = map[ virtualDocumentPointer ];
            wordReader.setReader( new FastBufferedReader( fragment.text() ) );
            scan[ i ].processDocument( virtualDocumentPointer, wordReader );
          }
          if ( building ) builder.virtualField( fragments );
          overallTerms += scan[ i ].numTerms;
          break;
        default:
          Object o = document.content( indexedField[ i ] );
          accumulator[ i ].processData( map != null ? map[ documentPointer ] : documentPointer, o );
          if ( building ) builder.nonTextField( o );
          break;
        }


        if ( scan[ i ] != null && scan[ i ].outOfMemoryError ) outOfMemoryError = true;
      }
      if ( building ) builder.endDocument();
      documentPointer++;
      documentsInBatch++;
      document.close();
      pl.update();


      // We try compaction if we detect less than PERC_AVAILABLE_MEMORY_CHECK memory available
      long percAvailableMemory = Util.percAvailableMemory();
      boolean compacted = false;
      if ( ! outOfMemoryError && percAvailableMemory < PERC_AVAILABLE_MEMORY_CHECK ) {
        LOGGER.info( "Starting compaction... (" + percAvailableMemory + "% available)" );
        compacted = true;
        Util.compactMemory();
        percAvailableMemory = Util.percAvailableMemory();
        LOGGER.info( "Compaction completed (" + percAvailableMemory + "% available)" );
      }
      
      if ( outOfMemoryError || overallTerms >= maxTerms || documentsInBatch == documentsPerBatch || ( compacted && percAvailableMemory < PERC_AVAILABLE_MEMORY_DUMP ) ) {
        if ( outOfMemoryError ) LOGGER.warn( "OutOfMemoryError during buffer reallocation: writing a batch of " + documentsInBatch + " documents" );
        else if ( overallTerms >= maxTerms ) LOGGER.warn( "Too many terms (" + overallTerms + "): writing a batch of " + documentsInBatch + " documents" );
        else if ( compacted && percAvailableMemory < PERC_AVAILABLE_MEMORY_DUMP ) LOGGER.warn( "Available memory below " + PERC_AVAILABLE_MEMORY_DUMP + "%: writing a batch of " + documentsInBatch + " documents" );


        long occurrences = 0;
        for ( int i = 0; i < numberOfIndexedFields; i++ ) {
          switch ( factory.fieldType( indexedField[ i ] ) ) {
          case TEXT:
          case VIRTUAL:
            occurrences += scan[ i ].dumpBatch();
            scan[ i ].openSizeBitStream();
            break;
          default:
            accumulator[ i ].writeData();
          }
        }
        
        if ( building ) {
          builder.close();
          builder.open( "@" + scan[ 0 ].batch );
        }


        LOGGER.info( "Last set of batches indexed at " + Util.format( ( 1000. * occurrences ) / ( System.currentTimeMillis() - batchStartTime ) ) + " occurrences/s" );
        batchStartTime = System.currentTimeMillis();
        documentsInBatch = 0;
        outOfMemoryError = false;
      }
    }


    iterator.close();
    if ( builder != null ) builder.close();


    for ( int i = 0; i < numberOfIndexedFields; i++ ) {
      switch ( factory.fieldType( indexedField[ i ] ) ) {
      case TEXT:

View Full Code Here

    if ( ! jsapResult.userSpecified( "uris" ) && ! jsapResult.userSpecified( "titles" ) ) 
      throw new IllegalArgumentException( "You specify either a title or a URI output file" );
    
    Util.ensureLog4JIsConfigured();


    final DocumentIterator documentIterator = documentSequence.iterator();


    Document document;
    FastBufferedOutputStream uriStream = null, titleStream = null;
    
    if ( jsapResult.userSpecified( "uris" ) ) uriStream = new FastBufferedOutputStream( new FileOutputStream( jsapResult.getString( "uris" ) ) );
    if ( jsapResult.userSpecified( "titles" ) ) titleStream = new FastBufferedOutputStream( new FileOutputStream( jsapResult.getString( "titles" ) ) );
    
    MutableString s = new MutableString();


    ProgressLogger progressLogger = new ProgressLogger( LOGGER, jsapResult.getLong( "logInterval" ), "documents" );
    if ( documentSequence instanceof DocumentCollection ) progressLogger.expectedUpdates = ((DocumentCollection)documentSequence).size();
    progressLogger.start( "Scanning..." );
    
    while( ( document = documentIterator.nextDocument() ) != null ) {
      if ( uriStream != null ) {
        s.replace( document.uri() );
        s.replace( LINE_TERMINATORS, SPACES );
        s.writeUTF8( uriStream );
        uriStream.write( '\n' );

View Full Code Here

   * immaterial if no field is virtual).
   * @param index a list of indices that have indexed one or more fields of <code>sequence</code>.
   */
  @SuppressWarnings("unchecked")
  public void checkAgainstContent( DocumentSequence sequence, int[] map, VirtualDocumentResolver resolver, int gap, Index... index ) throws IOException {
    DocumentIterator iterator = sequence.iterator();
    DocumentFactory factory = sequence.factory();
    Document document;
    final int n = index.length;
    final int[] field = new int[ n ];
    final int[][] currMaxPos = new int[ n ][];
    final int[] maxDoc = new int[ n ];
    IntArrays.fill( maxDoc, -1 );
    final Object2ObjectOpenHashMap<MutableString, ObjectArrayList<int[]>>[] termMap = new Object2ObjectOpenHashMap[ n ];
    final IntArrayList[] payloadPointers = new IntArrayList[ n ];
    final ObjectArrayList<Object>[] payloadContent = new ObjectArrayList[ n ];


    for ( int i = 0; i < n; i++ ) {
      field[ i ] = factory.fieldIndex( index[ i ].field );
      switch ( factory.fieldType( field[ i ] ) ) {
      case VIRTUAL:
        currMaxPos[ i ] = new int[ resolver.numberOfDocuments() ];
      case TEXT:
        termMap[ i ] = new Object2ObjectOpenHashMap<MutableString, ObjectArrayList<int[]>>();
        break;
      case DATE:
      case INT:
        payloadPointers[ i ] = new IntArrayList();
        payloadContent[ i ] = new ObjectArrayList<Object>();
      }
    }


    int documentIndex = 0;


    while ( ( document = iterator.nextDocument() ) != null ) {
      for ( int i = 0; i < field.length; i++ ) {
        switch ( factory.fieldType( field[ i ] ) ) {
        case TEXT:
          processDocument( document.wordReader( field[ i ] ).setReader( (Reader)document.content( field[ i ] ) ), map == null ? documentIndex : map[ documentIndex ], 0, termMap[ i ],
              index[ i ].termProcessor );
          break;
        case VIRTUAL:
          ObjectArrayList<VirtualDocumentFragment> fragments = (ObjectArrayList<VirtualDocumentFragment>)document.content( field[ i ] );
          resolver.context( document );
          for ( VirtualDocumentFragment fragment : fragments ) {
            int d = resolver.resolve( fragment.documentSpecifier() );


            if ( d != -1 ) {
              if ( map != null ) d = map[ d ];
              if ( maxDoc[ i ] < d ) maxDoc[ i ] = d;
              currMaxPos[ i ][ d ] = processDocument( document.wordReader( field[ i ] ).setReader( new FastBufferedReader( fragment.text() ) ), d, currMaxPos[ i ][ d ], termMap[ i ],
                  index[ i ].termProcessor )
                  + gap;
            }
          }
          break;
        case INT:
        case DATE:
          Object x = document.content( field[ i ] );
          if ( x != null ) {
            payloadPointers[ i ].add( map == null ? documentIndex : map[ documentIndex ] );
            payloadContent[ i ].add( x );
          }
        default:
        }
      }
      document.close();
      documentIndex++;
    }


    iterator.close();


    for ( int i = 0; i < n; i++ ) {
      if ( termMap[ i ] != null ) for ( ObjectArrayList<int[]> list : termMap[ i ].values() ) {
        // We sort in all cases, just to reduce the possible execution paths
        Collections.sort( list, new Comparator<int[]>() {

View Full Code Here

                new HtmlDocumentFactory( new String[] { "encoding=ISO-8859-1" } ) } ),
        4, // Very small, to induce fragmentation
        false);


    try {
      DocumentIterator iter = collection.iterator();
      Document d;
      while ((d = iter.nextDocument()) != null)
        d.title();
    } catch (IllegalStateException e) {
      assertTrue(false);
    }

View Full Code Here

                new TRECHeaderDocumentFactory(),
                new HtmlDocumentFactory( new String[] { "encoding=ISO-8859-1" } ) } ),
        4, // Very small, to induce fragmentation
        false);


    DocumentIterator iter = collection.iterator();
    Document d = null;


    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0001/", d.uri());
    assertEquals("GX001", d.title());


    final int textIndex = collection.factory().fieldIndex( "text" );
    
    assertEquals( "Line 1\n     The line 2!\n  Mamma\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    
    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0002/", d.uri());
    assertEquals("GX002", d.title());


    assertEquals( "Contents of this file reside on one line only\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );


    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0003/", d.uri());
    assertEquals("GX003", d.title());


    assertEquals( "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );


    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0004/", d.uri());
    assertEquals("GX004", d.title());


    assertEquals( "New content 0\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    
    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0005/", d.uri());
    assertEquals("GX005", d.title());


    assertEquals( "New content 1\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );


    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0006/", d.uri());
    assertEquals("GX006", d.title());


    assertEquals( "New content 2\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );


    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0007/", d.uri());
    assertEquals("GX007", d.title());


    assertEquals( "", IOUtils.toString( (Reader)d.content( textIndex ) ) );


    d = iter.nextDocument();
    assertNull(d);
    iter.close();
    
    d = collection.document( 0 );
    assertNotNull(d);
    assertEquals("http://gx0001/", d.uri());
    assertEquals("GX001", d.title());

View Full Code Here

TOP

Related Classes of it.unimi.dsi.mg4j.document.DocumentIterator

it.unimi.dsi.mg4j.document.TRECDocumentCollectionTest

it.unimi.dsi.mg4j.tool.IndexTest

it.unimi.dsi.mg4j.tool.Scan

it.unimi.dsi.mg4j.tool.ScanMetadata

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.