Package it.unimi.dsi.mg4j.document

Examples of it.unimi.dsi.mg4j.document.Document

        // Excerpt (the enclosing method is not visible here): prints one scored result to `output`:
        // the document number, the score with six decimals, a title when one can be obtained and,
        // in LONG/SNIPPET display modes, per-index interval information.
        // NOTE(review): this excerpt appears to have lost lines and tokens in extraction (closing
        // braces, parts of some expressions); the lines flagged below are incomplete as shown.
        final int document = dsi.document;
        output.print( "Document #" + document );
        output.printf( " [%.6f]", dsi.score );
        Document d = null; // Filled lazily
        // We try to print a title, preferring the supplied title list if present
        if ( titleList != null ) output.println( " " + titleList.get( document ) );
        else if ( documentCollection != null ) {
          d = documentCollection.document( document );
          output.println( " " + d.title().toString().trim() );
        // Neither a title list nor a collection: just terminate the line.
        else output.println();
        // NOTE(review): the left operand of "!= null" is missing in this excerpt.
        if ( ( displayMode == OutputType.LONG || displayMode == OutputType.SNIPPET ) && != null && queryEngine.intervalSelector != null ) {
          // NOTE(review): the initializer below is incomplete as shown (unbalanced parenthesis).
          final Index[] sortedIndex = new Index[ 0 ] );
          // When a collection is available, order the indices by the position of their field in its factory.
          if ( documentCollection != null ) Arrays.sort( sortedIndex, new Comparator<Index>() {
            public int compare( final Index i0, final Index i1 ) {
              return documentCollection.factory().fieldIndex( i0.field ) - documentCollection.factory().fieldIndex( i1.field );
            }} );
          for( Index index: sortedIndex )
            if ( index.hasPositions ) {
              // NOTE(review): the right-hand side below is incomplete as shown.
              SelectedInterval[] interval = index );
              // The two sentinel arrays are compared by identity and printed as TRUE/FALSE.
              if ( interval == SelectedInterval.TRUE_ARRAY ) output.println( index.field + ": TRUE" );
              else if ( interval == SelectedInterval.FALSE_ARRAY ) output.println( index.field + ": FALSE" );
              // LONG mode, or no collection to take text from: print the raw intervals.
              else if ( displayMode == OutputType.LONG || documentCollection == null ) output.println( index.field + ": " + Arrays.toString( interval ) );
              else { // SNIPPET_MODE
                final MarkingMutableString s = new MarkingMutableString( marker );
                s.startField( interval );
                // TODO: this must be in increasing field order
                if ( d == null ) d = documentCollection.document( document );
                int fieldIndex = documentCollection.factory().fieldIndex( index.field );
                // Skip fields that the factory does not know or that are not of type TEXT.
                if ( fieldIndex == -1 || documentCollection.factory().fieldType( fieldIndex ) != DocumentFactory.FieldType.TEXT ) continue;
                final Reader reader = (Reader)d.content( fieldIndex );
                // Feed the field's word reader to the marking string, then print the result.
                s.appendAndMark( d.wordReader( fieldIndex ).setReader( reader ) );
                output.println( index.field + ": " + s.toString() );
            // NOTE(review): the second operand of "&&" below is incomplete as shown.
            else if ( index.hasPayloads && index ) == SelectedInterval.TRUE_ARRAY ) {
              if ( d == null ) d = documentCollection.document( document );
              int fieldIndex = documentCollection.factory().fieldIndex( index.field );
              if ( fieldIndex == -1 ) continue;
              // Payload index: print the content of the field itself.
              output.println( d.content( fieldIndex ) );
            // Excerpt (enclosing method not visible): creates a ResultItem for the scored document `dsi`,
            // adds it to resultItems and, when a collection is available, fills in an HTML-escaped title,
            // a URI and a snippet.
            // NOTE(review): closing braces and parts of some expressions appear to have been lost in
            // extraction; the lines flagged below are incomplete as shown.
            LOGGER.debug( "Intervals for item " + i );
            final ResultItem resultItem = new ResultItem( dsi.document, dsi.score );
            resultItems.add( resultItem );

            if ( collection != null ) {
              final Document document = collection.document( dsi.document );
              // If both collection and title list are present, we override the collection title (cf. Query)
              resultItem.title = StringEscapeUtils.escapeHtml( titleList != null ? titleList.get( resultItem.doc ).toString() : document.title().toString() );
              // With useUri set, the document's own URI (if any) is used as the link.
              if ( useUri ) {
                if ( document.uri() != null ) resultItem.uri = StringEscapeUtils.escapeHtml( document.uri().toString() );
              // Otherwise the link points to "./Item" with the document number, the MIME type and, if known, the URI.
              else {
                if ( document.uri() != null ) {
                  String stringUri = document.uri().toString();
                  // TODO: this is a quick patch to get the file server running with relative files
                  // With derelativise set, a leading "file:" is stripped and the path is made absolute
                  // before being turned back into a URI; either way the result is URL-encoded as UTF-8.
                  final String documentUri = URLEncoder.encode( derelativise
                  ? new File( stringUri.startsWith( "file:" ) ? stringUri.substring( 5 ) : stringUri ).getAbsoluteFile().toURI().toASCIIString()
                      : document.uri().toString(), "UTF-8" );
                  resultItem.uri = StringEscapeUtils.escapeHtml( "./Item?doc=" + resultItem.doc + "&m=" + urlEncodedMimeType + "&uri=" + documentUri );
                else resultItem.uri = StringEscapeUtils.escapeHtml( "./Item?doc=" + resultItem.doc + "&m=" + urlEncodedMimeType );
              // The snippet is accumulated across all indices with positions.
              MarkingMutableString snippet = new MarkingMutableString( TextMarker.HTML_STRONG, MarkingMutableString.HTML_ESCAPE );
              for( int j = 0; j < sortedIndex.length; j++ ) {
                // NOTE(review): the left operand of "== null" is missing in this excerpt.
                if ( ! sortedIndex[ j ].hasPositions || == null ) continue;
                // NOTE(review): the right-hand side below is incomplete as shown.
                selectedInterval = sortedIndex[ j ] );
                if ( selectedInterval != null ) {
                  final int field = documentCollection.factory().fieldIndex( sortedIndex[ j ].field );
                  // If the field is not present (e.g., because of parallel indexing) or it is not text we skip
                  if ( field == -1 || documentCollection.factory().fieldType( field ) != DocumentFactory.FieldType.TEXT ) continue;
                  LOGGER.debug( "Found intervals for " + sortedIndex[ j ].field + " (" + field + ")" );
                  final Reader content = (Reader)document.content( field );
                  // Append this field's words to the snippet between startField and endField.
                  snippet.startField( selectedInterval ).appendAndMark( document.wordReader( field ).setReader( content ) ).endField();
                if ( LOGGER.isDebugEnabled() ) LOGGER.debug( sortedIndex[ j ].field + ": " + ( selectedInterval == null ? null : Arrays.asList( selectedInterval ) ) );
              resultItem.text = snippet;
            // NOTE(review): the body of this else branch is not part of the excerpt.
            else {
    // Excerpt (enclosing method not visible): walks all documents of documentSequence and, for each
    // indexed field, hands the content to the scanner or accumulator matching the field type; when
    // `building` is set the same events are also forwarded to `builder`.
    // NOTE(review): closing braces, and apparently `break` statements and a case label, are missing
    // from this excerpt; do not read the switch below as intentional fall-through without checking
    // the complete source.
    DocumentIterator iterator = documentSequence.iterator();
    Reader reader;
    WordReader wordReader;
    ObjectList<VirtualDocumentFragment> fragments;
    Document document;

    int documentPointer = 0, documentsInBatch = 0;
    long batchStartTime = System.currentTimeMillis();
    boolean outOfMemoryError = false;

    while ( ( document = iterator.nextDocument() ) != null ) {
      long overallTerms = 0;
      if ( building ) builder.startDocument( document.title(), document.uri() );
      for ( int i = 0; i < numberOfIndexedFields; i++ ) {
        switch ( factory.fieldType( indexedField[ i ] ) ) {
        case TEXT:
          reader = (Reader)document.content( indexedField[ i ] );
          wordReader = document.wordReader( indexedField[ i ] );
          wordReader.setReader( reader );
          if ( building ) builder.startTextField();
          // The document pointer is remapped through `map` when a map is given.
          scan[ i ].processDocument( map != null ? map[ documentPointer ] : documentPointer, wordReader );
          if ( building ) builder.endTextField();
          overallTerms += scan[ i ].numTerms;
        case VIRTUAL:
          fragments = (ObjectList<VirtualDocumentFragment>)document.content( indexedField[ i ] );
          wordReader = document.wordReader( indexedField[ i ] );
          virtualDocumentResolver[ i ].context( document );
          for( VirtualDocumentFragment fragment: fragments ) {
            int virtualDocumentPointer = virtualDocumentResolver[ i ].resolve( fragment.documentSpecifier() );
            // A negative result from resolve() means the fragment is skipped.
            if ( virtualDocumentPointer < 0 ) continue;
            if ( map != null ) virtualDocumentPointer = map[ virtualDocumentPointer ];
            // Each fragment's text is scanned as content of the resolved (and possibly remapped) document.
            wordReader.setReader( new FastBufferedReader( fragment.text() ) );
            scan[ i ].processDocument( virtualDocumentPointer, wordReader );
          if ( building ) builder.virtualField( fragments );
          overallTerms += scan[ i ].numTerms;
          // NOTE(review): no case label is visible for the next three lines; presumably they belong to
          // a branch for the remaining field types whose label was lost -- confirm against the full source.
          Object o = document.content( indexedField[ i ] );
          accumulator[ i ].processData( map != null ? map[ documentPointer ] : documentPointer, o );
          if ( building ) builder.nonTextField( o );

        // Remember whether any scanner reported an out-of-memory condition.
        if ( scan[ i ] != null && scan[ i ].outOfMemoryError ) outOfMemoryError = true;
      if ( building ) builder.endDocument();

      // We try compaction if we detect less than PERC_AVAILABLE_MEMORY_CHECK memory available
      long percAvailableMemory = Util.percAvailableMemory();
      boolean compacted = false;
  /**
   * Handles a request for a single document of the collection stored in the servlet context
   * under the attribute "collection".
   *
   * <p>When the request carries a "doc" parameter, the parameter is parsed as an integer document
   * index, every field of that document is converted to an HTML-escaped string (for TEXT fields the
   * whole reader is consumed and each newline is followed by a line break tag), and the title, the
   * field list and the factory are stored in the context under "title", "fields" and "factory".
   *
   * <p>NOTE(review): this excerpt is truncated; closing braces and whatever happens when "doc" is
   * absent are not visible here.
   *
   * @param request the servlet request; the parameters "doc" and "m" are read.
   * @param response the servlet response; its content type and character encoding are set.
   * @param context the template context that receives "title", "fields" and "factory".
   * @return in the visible branch, the template "it/unimi/dsi/mg4j/query/generic.velocity".
   * @throws Exception declared by the signature; the visible code calls Integer.parseInt and reads field content.
   */
  public Template handleRequest( final HttpServletRequest request, final HttpServletResponse response, final Context context ) throws Exception {
    if ( request.getParameter( "doc" ) != null ) {
      DocumentCollection collection = (DocumentCollection)getServletContext().getAttribute( "collection" );
      // NOTE(review): the response content type is taken verbatim from the request parameter "m".
      response.setContentType( request.getParameter( "m" ) );
      response.setCharacterEncoding( "UTF-8" );
      final Document document = collection.document( Integer.parseInt( request.getParameter( "doc" ) ) );
      final DocumentFactory factory = collection.factory();
      final ObjectArrayList<String> fields = new ObjectArrayList<String>();
      final int numberOfFields = factory.numberOfFields();
      LOGGER.debug( "ParsingFactory declares " + numberOfFields + " fields"  );
      for( int field = 0; field < numberOfFields; field++ ) {
        // Non-text content is rendered through toString(); text content is read fully from its Reader.
        if ( factory.fieldType( field ) != FieldType.TEXT ) fields.add( StringEscapeUtils.escapeHtml( document.content( field ).toString() ) );
        else fields.add( StringEscapeUtils.escapeHtml( IOUtils.toString( (Reader)document.content( field ) ) ).replaceAll( "\n", "<br>\n" ) );
      context.put( "title", document.title() );
      context.put( "fields", fields );
      context.put( "factory", factory );
      return getTemplate( "it/unimi/dsi/mg4j/query/generic.velocity" );
    // Excerpt (enclosing method not visible): scans documentSequence and writes each document's URI
    // and/or title, one per line and UTF-8 encoded, to the files named by the "uris" and "titles"
    // options when the user specified them.
    // NOTE(review): closing braces are missing from this excerpt, and neither progress updates nor
    // the closing of the two streams is visible here -- check the complete source.
    final DocumentIterator documentIterator = documentSequence.iterator();

    Document document;
    FastBufferedOutputStream uriStream = null, titleStream = null;
    if ( jsapResult.userSpecified( "uris" ) ) uriStream = new FastBufferedOutputStream( new FileOutputStream( jsapResult.getString( "uris" ) ) );
    if ( jsapResult.userSpecified( "titles" ) ) titleStream = new FastBufferedOutputStream( new FileOutputStream( jsapResult.getString( "titles" ) ) );
    // A single mutable string is reused as scratch space for every URI and title.
    MutableString s = new MutableString();

    ProgressLogger progressLogger = new ProgressLogger( LOGGER, jsapResult.getLong( "logInterval" ), "documents" );
    // The expected number of updates is known only when the sequence is a collection.
    if ( documentSequence instanceof DocumentCollection ) progressLogger.expectedUpdates = ((DocumentCollection)documentSequence).size();
    progressLogger.start( "Scanning..." );
    while( ( document = documentIterator.nextDocument() ) != null ) {
      if ( uriStream != null ) {
        s.replace( document.uri() );
        // LINE_TERMINATORS and SPACES are defined elsewhere; presumably this keeps each entry on one line.
        s.replace( LINE_TERMINATORS, SPACES );
        s.writeUTF8( uriStream );
        uriStream.write( '\n' );
      if ( titleStream != null ) {
        s.replace( document.title() );
        s.replace( LINE_TERMINATORS, SPACES );
        s.writeUTF8( titleStream );
        titleStream.write( '\n' );
      // Excerpt (enclosing method not visible): random-access verification. Every document of
      // documentSequence is re-read and, for each index, the content found in the document is
      // compared with what the index readers return; mismatches are reported through LOGGER.error.
      // NOTE(review): closing braces and parts of some expressions appear to have been lost in
      // extraction; the lines flagged below are incomplete as shown. No increment of docCounter
      // or of pos is visible in this excerpt either.
      pl.itemsName = "documents";
      pl.start( "Verifying random access..." );

      if ( allBitStreamIndices ) {
        it.unimi.dsi.mg4j.document.DocumentIterator documentIterator = documentSequence.iterator();
        Document document;
        Reader reader;
        WordReader wordReader;
        final MutableString word = new MutableString(), nonWord = new MutableString();
        int docCounter = 0;
        while( ( document = documentIterator.nextDocument() ) != null ) {
          // The document number is remapped through `permutation` when one is given.
          currDoc = permutation != null ? permutation[ docCounter ] : docCounter;

          for( i = 0; i < index.length; i++ ) {
            Object content = document.content( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
            if ( index[ i ].hasPayloads ) {
              // TODO: write tests for the other case
              // NOTE(review): this test repeats the enclosing condition, so the else branch below
              // looks unreachable here unless allBitStreamIndices is reassigned in lines not shown.
              if ( allBitStreamIndices ) {
                IndexIterator indexIterator = indexReader[ i ].documents( 0 );
                int pointer = indexIterator.skipTo( currDoc );
                if ( pointer == currDoc ) {
                  Payload payload = indexIterator.payload();
                  // NOTE(review): the message concatenates `payload` itself, whereas the comparison uses payload.get().
                  if ( ! payload.get().equals( content ) ) LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + payload )
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              else {
                // NOTE(review): the call below is incomplete as shown.
                IndexIterator indexIterator = indexReader[ i ].documents);
                if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                  if ( ! indexIterator.payload().get().equals( content ) )
                    LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + indexIterator.payload().get() );
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
            else {
              // text index
              pos = 0;
              termsInDoc[ i ].clear();
              reader = (Reader)content;
              wordReader = document.wordReader( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
              wordReader.setReader( reader );
              // NOTE(review): the loop condition below is incomplete as shown.
              while( word, nonWord ) ) {
                // Empty words, and words rejected by the index's term processor, are skipped.
                if ( word.length() == 0 || index[ i ].termProcessor != null && ! index[ i ].termProcessor.processTerm( word ) ) continue;
                // The term number comes from the index's term map; -1 is treated as "term not found".
                if ( ( t = (int)( (BitStreamIndex)index[ i ] ).termMap.getLong( word ) ) == -1 ) LOGGER.error( index[ i ] + ": Could not find term " + word + " in term index" );
                else {
                  // Record the per-document count and the term seen at each position.
                  if ( index[ i ].hasCounts ) termsInDoc[ i ].put( t, termsInDoc[ i ].get( t ) + 1 );
                  if ( index[ i ].hasPositions ) wordInPos[ i ][ pos++ ] = t;

              if ( allBitStreamIndices ) {
                // For every term seen in the document, compare count and positions with the index.
                for( IntIterator x = termsInDoc[ i ].keySet().iterator(); x.hasNext(); ) {
                  t = x.nextInt();

                  IndexIterator indexIterator = indexReader[ i ].documents( t );

                  int pointer = indexIterator.skipTo( currDoc );
                  if ( pointer == currDoc ) {
                    if ( index[ i ].hasCounts ) {
                      int c = indexIterator.count();
                      if ( termsInDoc[ i ].get( t ) !=  c )
                        LOGGER.error( index[ i ] + ": The count for term " + t + " in document " + currDoc + " is " + c + " instead of " + termsInDoc[ i ].get( t ) );
                      else {
                        if ( index[ i ].hasPositions ) {
                          indexIterator.positions( occ[ i ] );

                          for( int j = 0; j < c; j++ )
                            // NOTE(review): the condition below lacks its closing parenthesis in this excerpt, and the
                            // message concatenates the array occ[ i ] rather than the position occ[ i ][ j ].
                            if ( wordInPos[ i ][ occ[ i ][ j ] ] != t
                              LOGGER.error( index[ i ] + ": The occurrence of index " + i + " of term " + t + " (position " + occ[ i ] +") in document " + currDoc + " is occupied instead by term " + wordInPos[ i ][ occ[ i ][ j ] ] );
                  else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t + "(skipTo returned " + pointer + ")" );
              else {
                for( IntIterator x = termsInDoc[ i ].keySet().iterator(); x.hasNext(); ) {
                  t = x.nextInt();
                  // With termLists set, the inverted list is looked up through terms[ i ].get( t ) instead of the term number.
                  IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );

                  if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                    if ( index[ i ].hasCounts ) {
                      int c = indexIterator.count();
                      if ( termsInDoc[ i ].get( t ) !=  c )
                        LOGGER.error( index[ i ] + ": The count for term " + t + " in document " + currDoc + " is " + c + " instead of " + termsInDoc[ i ].get( t ) );
                      else {
                        if ( index[ i ].hasPositions ) {
                          indexIterator.positions( occ[ i ] );

                          for( int j = 0; j < c; j++ )
                            // NOTE(review): same remarks as in the loop above (missing parenthesis, occ[ i ] in the message).
                            if ( wordInPos[ i ][ occ[ i ][ j ] ] != t
                              LOGGER.error( index[ i ] + ": The occurrence of index " + i + " of term " + t + " (position " + occ[ i ] +") in document " + currDoc + " is occupied instead by term " + wordInPos[ i ][ occ[ i ][ j ] ] );
                  else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
      else {
        // Not all indices are bitstream indices: every single word occurrence is looked up by term.
        LOGGER.warn( "Random access tests require very slow single-term scanning as not all indices are disk based" );

        it.unimi.dsi.mg4j.document.DocumentIterator documentIterator = documentSequence.iterator();
        Document document;
        Reader reader;
        WordReader wordReader;
        final MutableString word = new MutableString(), nonWord = new MutableString();
        int docCounter = 0;
        while( ( document = documentIterator.nextDocument() ) != null ) {
          currDoc = permutation != null ? permutation[ docCounter ] : docCounter;

          for( i = 0; i < index.length; i++ ) {
            Object content = document.content( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
            if ( index[ i ].hasPayloads ) {
              // NOTE(review): we are in the else branch of the same test, so this condition looks
              // always false here unless allBitStreamIndices is reassigned in lines not shown.
              if ( allBitStreamIndices ) {
                IndexIterator indexIterator = indexReader[ i ].documents( 0 );
                int pointer = indexIterator.skipTo( currDoc );
                if ( pointer == currDoc ) {
                  Payload payload = indexIterator.payload();
                  // NOTE(review): the message concatenates `payload` itself, whereas the comparison uses payload.get().
                  if ( ! payload.get().equals( content ) ) LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + payload )
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              else {
                // Here the payload list is looked up under the term "#".
                IndexIterator indexIterator = indexReader[ i ].documents( "#" );
                if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                  if ( ! indexIterator.payload().get().equals( content ) )
                    LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + indexIterator.payload().get() );
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
            else {
              pos = 0;
              reader = (Reader)content;
              wordReader = document.wordReader( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
              wordReader.setReader( reader );
              // NOTE(review): the loop condition below is incomplete as shown.
              while( word, nonWord ) ) {
                if ( word.length() == 0 || index[ i ].termProcessor != null && ! index[ i ].termProcessor.processTerm( word ) ) continue;
                IndexIterator indexIterator = indexReader[ i ].documents( word );
                if ( currDoc != indexIterator.skipTo( currDoc ) )
                  LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + word );
                else if ( index[ i ].hasPositions ) {
                  indexIterator.positions( occ[ i ] );
                  // The current position must occur among the first count() entries of the position array.
                  if ( IntArrayList.wrap( occ[ i ], indexIterator.count() ).indexOf( pos ) == -1 )
                    LOGGER.error( index[ i ] + ": Position " + pos + " does not appear in the position list of term " + word + " in document " + currDoc );
  /**
   * Collects, for each given index, the content of its field over all documents of a sequence.
   *
   * <p>The visible part of this method only gathers data: terms of TEXT and VIRTUAL fields are
   * passed to processDocument together with a per-index term map, and non-null INT/DATE values are
   * stored with their document pointers. NOTE(review): the excerpt is truncated; closing braces,
   * probably the `break` statements of both switches, and the code that actually compares this
   * data with the indices are not visible here.
   *
   * @param sequence the document sequence to scan; its factory maps field names to field indices.
   * @param map if not null, document pointers are remapped through this array.
   * @param resolver used to resolve the document specifiers of virtual fragments.
   * @param gap added to the value returned by processDocument for each virtual fragment before it
   * is stored as the current maximum position of the target document.
   * @param index the indices to check; each one names the field it refers to.
   * @throws IOException declared by the signature; the visible code reads document content.
   */
  public void checkAgainstContent( DocumentSequence sequence, int[] map, VirtualDocumentResolver resolver, int gap, Index... index ) throws IOException {
    DocumentIterator iterator = sequence.iterator();
    DocumentFactory factory = sequence.factory();
    Document document;
    final int n = index.length;
    final int[] field = new int[ n ];
    final int[][] currMaxPos = new int[ n ][];
    final int[] maxDoc = new int[ n ];
    IntArrays.fill( maxDoc, -1 );
    // Arrays of generic types cannot be created directly, hence the raw array creations below.
    final Object2ObjectOpenHashMap<MutableString, ObjectArrayList<int[]>>[] termMap = new Object2ObjectOpenHashMap[ n ];
    final IntArrayList[] payloadPointers = new IntArrayList[ n ];
    final ObjectArrayList<Object>[] payloadContent = new ObjectArrayList[ n ];

    // Allocate the per-index structures according to the type of the index's field.
    for ( int i = 0; i < n; i++ ) {
      field[ i ] = factory.fieldIndex( index[ i ].field );
      switch ( factory.fieldType( field[ i ] ) ) {
      case VIRTUAL:
        currMaxPos[ i ] = new int[ resolver.numberOfDocuments() ];
      case TEXT:
        termMap[ i ] = new Object2ObjectOpenHashMap<MutableString, ObjectArrayList<int[]>>();
      case DATE:
      case INT:
        payloadPointers[ i ] = new IntArrayList();
        payloadContent[ i ] = new ObjectArrayList<Object>();

    int documentIndex = 0;

    while ( ( document = iterator.nextDocument() ) != null ) {
      for ( int i = 0; i < field.length; i++ ) {
        switch ( factory.fieldType( field[ i ] ) ) {
        case TEXT:
          // Text is processed starting from position 0 under the (possibly remapped) document index.
          processDocument( document.wordReader( field[ i ] ).setReader( (Reader)document.content( field[ i ] ) ), map == null ? documentIndex : map[ documentIndex ], 0, termMap[ i ],
              index[ i ].termProcessor );
        case VIRTUAL:
          ObjectArrayList<VirtualDocumentFragment> fragments = (ObjectArrayList<VirtualDocumentFragment>)document.content( field[ i ] );
          resolver.context( document );
          for ( VirtualDocumentFragment fragment : fragments ) {
            int d = resolver.resolve( fragment.documentSpecifier() );

            // Fragments for which resolve() returns -1 are ignored.
            if ( d != -1 ) {
              if ( map != null ) d = map[ d ];
              if ( maxDoc[ i ] < d ) maxDoc[ i ] = d;
              // Each fragment continues from the position stored for its target document.
              currMaxPos[ i ][ d ] = processDocument( document.wordReader( field[ i ] ).setReader( new FastBufferedReader( fragment.text() ) ), d, currMaxPos[ i ][ d ], termMap[ i ],
                  index[ i ].termProcessor )
                  + gap;
        case INT:
        case DATE:
          Object x = document.content( field[ i ] );
          // Only non-null payloads are recorded, together with their document pointer.
          if ( x != null ) {
            payloadPointers[ i ].add( map == null ? documentIndex : map[ documentIndex ] );
            payloadContent[ i ].add( x );

  /**
   * Returns the number of documents, i.e. the length of the {@code virtual} array.
   * NOTE(review): the closing brace of this method is missing from the excerpt.
   */
  public int size() {
    return virtual.length;

  /**
   * Returns a document whose content is built on demand from {@code virtual[ index ]}.
   * NOTE(review): this excerpt is truncated; the body of the loop and the remaining methods of
   * the anonymous class are not visible here.
   *
   * @param index the index of the requested document.
   * @return an anonymous {@code Document} for the given index.
   */
  public Document document( final int index ) {
    return new Document() {
      // Nothing to release.
      public void close() {}
      public Object content( int field ) throws IOException {
        ensureDocumentIndex( index );
        // The result is assembled from the entries of virtual[ index ].
        ObjectArrayList<Anchor> result = new ObjectArrayList<Anchor>();
        for( Map.Entry<Integer, ? extends CharSequence> entry: virtual[ index ].entrySet() )
        // Test excerpt: the line below is the tail of a call whose beginning is not visible here.
        4, // Very small, to induce fragmentation

    // NOTE(review): the loop body and the catch body are missing from this excerpt; as shown, the
    // test iterates over the collection and expects a possible IllegalStateException.
    try {
      DocumentIterator iter = collection.iterator();
      Document d;
      while ((d = iter.nextDocument()) != null)
    } catch (IllegalStateException e) {

                // Test excerpt: the two lines below are the tail of the call that builds `collection`;
                // its beginning is not visible here.
                new HtmlDocumentFactory( new String[] { "encoding=ISO-8859-1" } ) } ),
        4, // Very small, to induce fragmentation

    // First pass: sequential access. Seven documents are expected, in order, each with a given
    // URI, title and "text" field content.
    DocumentIterator iter = collection.iterator();
    Document d = null;

    d = iter.nextDocument();
    assertEquals("http://gx0001/", d.uri());
    assertEquals("GX001", d.title());

    final int textIndex = collection.factory().fieldIndex( "text" );
    assertEquals( "Line 1\n     The line 2!\n  Mamma\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d = iter.nextDocument();
    assertEquals("http://gx0002/", d.uri());
    assertEquals("GX002", d.title());

    assertEquals( "Contents of this file reside on one line only\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );

    d = iter.nextDocument();
    assertEquals("http://gx0003/", d.uri());
    assertEquals("GX003", d.title());

    assertEquals( "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );

    d = iter.nextDocument();
    assertEquals("http://gx0004/", d.uri());
    assertEquals("GX004", d.title());

    assertEquals( "New content 0\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d = iter.nextDocument();
    assertEquals("http://gx0005/", d.uri());
    assertEquals("GX005", d.title());

    assertEquals( "New content 1\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );

    d = iter.nextDocument();
    assertEquals("http://gx0006/", d.uri());
    assertEquals("GX006", d.title());

    assertEquals( "New content 2\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );

    d = iter.nextDocument();
    assertEquals("http://gx0007/", d.uri());
    assertEquals("GX007", d.title());

    // The seventh document is expected to have empty text.
    assertEquals( "", IOUtils.toString( (Reader)d.content( textIndex ) ) );

    // NOTE(review): the result of this call is overwritten on the next line without being checked;
    // presumably an assertion on it was lost from the excerpt -- confirm against the full test.
    d = iter.nextDocument();
    // Second pass: random access by document index, with the same expectations as above.
    d = collection.document( 0 );
    assertEquals("http://gx0001/", d.uri());
    assertEquals("GX001", d.title());

    assertEquals( "Line 1\n     The line 2!\n  Mamma\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d = collection.document( 1 );
    assertEquals("http://gx0002/", d.uri());
    assertEquals("GX002", d.title());

    assertEquals( "Contents of this file reside on one line only\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );

    d = collection.document( 2 );
    assertEquals("http://gx0003/", d.uri());
    assertEquals("GX003", d.title());

    assertEquals( "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );

    d = collection.document( 3 );
    assertEquals("http://gx0004/", d.uri());
    assertEquals("GX004", d.title());

    assertEquals( "New content 0\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d = collection.document( 4 );
    assertEquals("http://gx0005/", d.uri());
    assertEquals("GX005", d.title());

    assertEquals( "New content 1\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );

    d = collection.document( 5 );
    assertEquals("http://gx0006/", d.uri());
    assertEquals("GX006", d.title());

    assertEquals( "New content 2\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );

    d = collection.document( 6 );
    assertEquals("http://gx0007/", d.uri());
    assertEquals("GX007", d.title());

    assertEquals( "", IOUtils.toString( (Reader)d.content( textIndex ) ) );

