/* Scraper page header (www.massapi.com) — not part of the original source file:
 * Package it.unimi.dsi.mg4j.tool
 * Source Code of it.unimi.dsi.mg4j.tool.Paste$DocumentIndexComparator
 */

package it.unimi.dsi.mg4j.tool;

/*    
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2005-2010 Sebastiano Vigna
*
*  This library is free software; you can redistribute it and/or modify it
*  under the terms of the GNU Lesser General Public License as published by the Free
*  Software Foundation; either version 3 of the License, or (at your option)
*  any later version.
*
*  This library is distributed in the hope that it will be useful, but
*  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
*  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
*  for more details.
*
*  You should have received a copy of the GNU Lesser General Public License
*  along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/

import it.unimi.dsi.Util;
import it.unimi.dsi.fastutil.ints.AbstractIntComparator;
import it.unimi.dsi.fastutil.ints.IntHeapPriorityQueue;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.mg4j.index.CachingOutputBitStream;
import it.unimi.dsi.mg4j.index.Index;
import it.unimi.dsi.mg4j.index.IndexIterator;
import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
import it.unimi.dsi.mg4j.index.CompressionFlags.Component;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.net.URISyntaxException;
import java.util.Map;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.log4j.Logger;

import com.martiansoftware.jsap.JSAPException;

/** Pastes several indices.
*
* <p>Pasting is a very slow way of combining indices: we assume
* that not only documents, but also document occurrences might be scattered
* throughout several indices. When a document appears in several indices,
* its occurrences in a given index are combined. We have two possibilities:
* <ul>
* <li><em>standard</em> pasting: position lists are simply concatenated&mdash;it
* is responsibility of the caller to guarantee that they have been numbered
* in an increasing fashion; the sizes of the last input index are the sizes of
* the pasted index;
* <li><em>incremental</em> pasting: position lists are concatenated, but each
* list is renumbered by adding to all positions the sum of the sizes of the
* current document for all indices that precede the current one (this kind
* of pasting was the only one available before version 3.0).
* </ul>
*
* <p>Standard pasting is used, for instance, to paste the batches of a
* {@linkplain it.unimi.dsi.mg4j.document.DocumentFactory.FieldType#VIRTUAL virtual field}
* generated by {@link Scan}; the latter takes care of numbering positions
* correctly. If, however, you index parts of the same document collection on
* different machines using the same {@link VirtualDocumentResolver}, then
* the resulting indices for virtual fields will
* have all position starting from zero, and they will need an incremental
* pasting to be combined correctly.
*
* <p>Conceptually, this operation is equivalent to splitting a collection
* <em>vertically</em>: each document is divided into a fixed number <var>n</var>
* of consecutive segments (possibly of length 0), and a set of <var>n</var> indices
* is created using the <var>k</var>-th segment of all documents. Pasting the
* resulting indices will produce an index that is identical to the index generated
* by the original collection. The behaviour is analogous to that of the UN*X
* <samp>paste</samp> command if documents are single-line lists of words.
*
* <p>Note that in case every document appears at most in one index pasting
* is equivalent to {@linkplain it.unimi.dsi.mg4j.tool.Merge merging}. It is, however,
* significantly slower, as the presence of the same document in several lists makes
* it necessary to scan completely the inverted lists to be pasted to compute the
* frequency. To do so, an in-memory buffer is allocated. If an inverted list does not fit
* in the memory buffer, it is spilled on disk. Sizing correctly the buffer, and choosing a fast
* file system for the temporary directory can significantly affect performance.
*
* <p><strong>Warning</strong>: incremental pasting is very memory-intensive, as
* a list of sizes must be loaded for each index. You can use the URI option
* <samp>succinctsizes=1</samp> to load sizes in a succinct format, which will
* ease the problem.
*  
* @author Sebastiano Vigna
* @since 1.0
*/

final public class Paste extends Combine {
  @SuppressWarnings("unused")
  private static final Logger LOGGER = Util.getLogger( Paste.class );
 
  /** The default size of the temporary bit stream buffer used while pasting. Posting lists larger
   * than this size will be precomputed on disk and then added to the index. */
  public final static int DEFAULT_MEMORY_BUFFER_SIZE = 16 * 1024 * 1024;
 
  /** The reference array of the document queue. */
  protected final int[] doc;
  /** Whether this paste is incremental. */
  private final boolean incremental;
  /** The queue containing document pointers (for remapped indices). */
  protected final IntHeapPriorityQueue documentQueue;
  /** The temporary cache file {@link #combine(int)}. */
  private final File tempFile;
  /** The temporary output bit stream for {@link #combine(int)}. */
  private final CachingOutputBitStream cacheBitStreamOut;
  /** The temporary output bit stream for {@link #combine(int)}. */
  private final InputBitStream cacheBitStreamIn;
  /** The input bit stream used to wrap directly {@link #cacheBitStreamOut}'s buffer. */
  private final InputBitStream cacheBitStreamInWrapper;
  /** The size of the size list for each index. */
  private final int[] sizesSize;
 
  /** Pastes several indices into one.
   *
   * @param outputBasename the basename of the combined index.
   * @param inputBasename the basenames of the input indices.
   * @param metadataOnly if true, we save only metadata (term list, frequencies, global counts).
   * @param incremental if true, we perform an incremental paste (needs sizes).
   * @param bufferSize the buffer size for index readers.
   * @param tempFileDir the directory of the temporary file used when pasting.
   * @param tempBufferSize the size of the in-memory buffer used when pasting.
   * @param writerFlags the flags for the index writer.
   * @param interleaved forces an interleaved index.
   * @param skips whether to insert skips in case <code>interleaved</code> is true.
   * @param quantum the quantum of skipping structures; if negative, a percentage of space for variable-quantum indices (irrelevant if <code>skips</code> is false).
   * @param height the height of skipping towers (irrelevant if <code>skips</code> is false).
   * @param skipBufferSize the size of the buffer used to hold temporarily inverted lists during the skipping structure construction.
   * @param logInterval how often we log.
   */
  public Paste( final String outputBasename,
      final String[] inputBasename,
      final boolean metadataOnly,
      final boolean incremental,
      final int bufferSize,
      final File tempFileDir,
      final int tempBufferSize,
      final Map<Component,Coding> writerFlags,
      final boolean interleaved,
      final boolean skips,
      final int quantum,
      final int height,
      final int skipBufferSize,
      final long logInterval ) throws IOException, ConfigurationException, URISyntaxException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {
    super( outputBasename, inputBasename, metadataOnly, incremental, bufferSize, writerFlags, interleaved, skips, quantum, height, skipBufferSize, logInterval );
    this.incremental = incremental;

    tempFile = File.createTempFile( "MG4J", ".data", tempFileDir );
    cacheBitStreamOut = new CachingOutputBitStream( tempFile, tempBufferSize );
    cacheBitStreamIn = new InputBitStream( tempFile, bufferSize );
    cacheBitStreamInWrapper = new InputBitStream( cacheBitStreamOut.buffer() );
    /* In this case, we must reallocate position as by merging occurences we might
     * obtain an occurrence list as large as the concatenation of all largest
     * lists. We use this estimate to allocate position, and update maxCount in
     * combine() to get the real maxCount. */
    int estimateForMaxCount = 0, tempSize = 0;
    sizesSize = incremental ? new int[ numIndices ] : null;
    if ( incremental ) for( int i = index.length; i-- != 0; ) sizesSize[ i ] = index[ i ].sizes.size();
 
    for( int i = 0; i < numIndices; i++ ) {
      if ( index[ i ].hasPayloads ) throw new IllegalArgumentException( "You cannot paste indices with payloads" );
      estimateForMaxCount += index[ i ].maxCount;
      tempSize = Math.max( tempSize, index[ i ].maxCount );
    }

    if ( hasPositions ) position = new int[ estimateForMaxCount ];
    doc = new int[ numIndices ];
    documentQueue = new IntHeapPriorityQueue( numIndices, new DocumentIndexComparator( doc ) );
  }

  /** A comparator making an integer priority queue work much like an indirect
   * priority queue, with the additional property of using the reference index as secondary key.
   */
 
  private final static class DocumentIndexComparator extends AbstractIntComparator {
    private final int[] refArray;

    public DocumentIndexComparator( final int[] refArray ) {
      this.refArray = refArray;
    }
    
    public int compare( final int i, final int j ) {
      final int t = refArray[ i ] - refArray[ j ];
      return t != 0 ? t : i - j;
    }
  }
 
 
  protected int combineNumberOfDocuments() {
    int n = 0;
    for( int i = 0; i < numIndices; i++ ) n = Math.max( n, index[ i ].numberOfDocuments );
    return n;
  }

  protected int combineSizes( final OutputBitStream sizesOutputBitStream ) throws IOException {
    int currDoc = 0, maxDocSize = 0;
   
    if ( incremental ) {
      // We accumulate document sizes in an array.
      size = new int[ numberOfDocuments ];
      for( int i = 0; i < numIndices; i++ ) {
        final IntIterator sizes = sizes( i );
        int j = index[ i ].numberOfDocuments;
        currDoc = 0;
        while( j-- != 0 ) maxDocSize = Math.max( maxDocSize, size[ currDoc++ ] += sizes.nextInt() );
        if ( sizes instanceof Closeable ) ((Closeable)sizes).close();
      }
      // We write the array.
      for( int s: size ) sizesOutputBitStream.writeGamma( s );
      // We keep it if we need sizes.
      if ( ! needsSizes ) size = null;
    }
    else {
      if ( needsSizes ) size = new int[ numberOfDocuments ];
      final IntIterator sizes = sizes( numIndices - 1 );
      int s = 0;
      // We copy the last file size, and store the elements in an array if needsSizes is true.
      for( int j = 0; j < numberOfDocuments; j++ ) {
        s = sizes.nextInt();
        if ( needsSizes ) size[ j ] = s;
        maxDocSize = Math.max( maxDocSize, s );
        sizesOutputBitStream.writeGamma( s );
      }
      if ( sizes instanceof Closeable ) ((Closeable)sizes).close();
      // We keep the array if we need sizes.
      if ( ! needsSizes ) size = null;
    }
    return maxDocSize;
  }


  protected int combine( final int numUsedIndices ) throws IOException {
    /* If we're merging just one list, merging is fine, and moreover
     * maxCount need not be updated, as it is already initialised to
     * the maximum over all indices. */
    int currIndex, prevDoc = -1, currDoc, count;
    int temp[];
    OutputBitStream obs;
    Index i;
    IndexIterator ii;
 
    // Note that the total frequency can be computed only during the merge.
    for( int k = numUsedIndices; k-- != 0; ) {
      currIndex = usedIndex[ k ];
      frequency[ currIndex ] = indexIterator[ currIndex ].frequency();
      doc[ currIndex ] = indexIterator[ currIndex ].nextDocument();
      documentQueue.enqueue( currIndex );
    }
   
    // First phase: we write the inverted list using a quick-and-dirty format in the cache.
    cacheBitStreamOut.position( 0 );
    int  totalFrequency = 0, increment, prevIndex, totalCount;
   
    while( ! documentQueue.isEmpty() ) {
      // We extract the smallest document pointer, and enqueue it in the new index.
      currDoc = doc[ currIndex = documentQueue.firstInt() ];
      totalFrequency++;
      if ( ! metadataOnly ) cacheBitStreamOut.writeDelta( currDoc - prevDoc - 1 );
     
      totalCount = prevIndex = increment = 0;
     
      do {
        if ( incremental)
          while( prevIndex < currIndex ) {
            /* Note that some virtual documents could not exist at all in some index (in which
             * case we extend the size list with zeroes). */
            if ( sizesSize[ prevIndex ] > currDoc ) increment += index[ prevIndex ].sizes.getInt( currDoc );
            prevIndex++;
          }
        i = index[ currIndex ];

        i = index[ currIndex ];
        ii = indexIterator[ currIndex ];
     
        if ( ! metadataOnly && i.hasCounts ) {
          count = ii.count();
          if ( i.hasPositions ) {
            temp = ii.positionArray();
            if ( ! incremental && totalCount > 0 && temp[ 0 ] <= position[ totalCount - 1 ] ) throw new IllegalStateException( "Positions in document " + currDoc + " are not increasing; you probably need to require an incremental pasting" );
            for( int k = count; k-- != 0; ) position[ totalCount + k ] = temp[ k ] + increment;
          }
          totalCount += count;
        }
       
        // If we just wrote the last document pointer of this term in index j, we dequeue it.
        if ( --frequency[ currIndex ] == 0 ) documentQueue.dequeue();
        else {
          doc[ currIndex ] = ii.nextDocument();
          documentQueue.changed();
        }
      } while( ! documentQueue.isEmpty() && doc[ currIndex = documentQueue.firstInt() ] == currDoc );
 
      if ( totalCount > maxCount ) maxCount = totalCount;
 
      if ( ! metadataOnly && hasCounts ) {
        cacheBitStreamOut.writeGamma( totalCount );
        if ( hasPositions ) {
          cacheBitStreamOut.writeDelta( position[ 0 ] );
          for( int k = 1; k < totalCount; k++ ) cacheBitStreamOut.writeDelta( position[ k ] - position[ k - 1 ] - 1 );
        }
      }
 
      prevDoc = currDoc;
    }
 
    if ( ! metadataOnly ) {
      // Finally, we pour the data into the actual index.

      if ( p != 0 ) variableQuantumIndexWriter.newInvertedList( totalFrequency, p, predictedSize, predictedLengthNumBits );
      else indexWriter.newInvertedList();

      indexWriter.writeFrequency( totalFrequency );
      cacheBitStreamOut.align();
      final InputBitStream ibs;

      if ( cacheBitStreamOut.buffer() != null ) ibs = cacheBitStreamInWrapper;
      else {
        cacheBitStreamOut.flush();
        ibs = cacheBitStreamIn;
        ibs.flush();
      }

      ibs.position( 0 );

      currDoc = -1;
      for( int j = totalFrequency; j-- != 0; ) {
        obs = indexWriter.newDocumentRecord();
        indexWriter.writeDocumentPointer( obs, currDoc = ibs.readDelta() + currDoc + 1 );
        if ( hasCounts ) {
          count = ibs.readGamma();
          indexWriter.writePositionCount( obs, count );
          if ( hasPositions ) {
            position[ 0 ] = ibs.readDelta();
            for( int k = 1; k < count; k++ ) position[ k ] = position[ k - 1 ] + ibs.readDelta() + 1;
            indexWriter.writeDocumentPositions( obs, position, 0, count, size != null ? size[ currDoc ] : -1 );
          }
        }
      }

    }
   
    return totalFrequency;
  }
 
  public void run() throws ConfigurationException, IOException {
    super.run();
    cacheBitStreamOut.close();
    tempFile.delete();
  }

  public static void main( String arg[] ) throws ConfigurationException, SecurityException, JSAPException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {
    Combine.main( arg, Paste.class );
  }
}
/* Scraper page footer (www.massapi.com) — not part of the original source file:
 * Related Classes of it.unimi.dsi.mg4j.tool.Paste$DocumentIndexComparator
 * Copyright © 2018 www.massapi.com. All rights reserved.
 * All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.
 */