package it.unimi.dsi.mg4j.tool;

import static it.unimi.dsi.logging.ProgressLogger.DEFAULT_LOG_INTERVAL;
import static it.unimi.dsi.mg4j.index.CompressionFlags.DEFAULT_PAYLOAD_INDEX;
import static it.unimi.dsi.mg4j.index.DiskBasedIndex.FREQUENCIES_EXTENSION;
import static it.unimi.dsi.mg4j.index.DiskBasedIndex.GLOBCOUNTS_EXTENSION;
import static it.unimi.dsi.mg4j.index.DiskBasedIndex.INDEX_EXTENSION;
import static it.unimi.dsi.mg4j.index.DiskBasedIndex.OFFSETS_EXTENSION;
import static it.unimi.dsi.mg4j.index.DiskBasedIndex.PROPERTIES_EXTENSION;
import static it.unimi.dsi.mg4j.index.DiskBasedIndex.SIZES_EXTENSION;
import static it.unimi.dsi.mg4j.index.DiskBasedIndex.TERMMAP_EXTENSION;
import static it.unimi.dsi.mg4j.index.DiskBasedIndex.TERMS_EXTENSION;
import it.unimi.dsi.Util;
import it.unimi.dsi.bits.TransformationStrategies;
import it.unimi.dsi.fastutil.Arrays;
import it.unimi.dsi.fastutil.Swapper;
import it.unimi.dsi.fastutil.ints.AbstractIntComparator;
import it.unimi.dsi.fastutil.ints.Int2ObjectArrayMap;
import it.unimi.dsi.fastutil.ints.Int2ObjectMap;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.ints.IntIterators;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.ObjectRBTreeSet;
import it.unimi.dsi.fastutil.objects.Reference2ObjectOpenHashMap;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.FileLinesCollection;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.io.FileLinesCollection.FileLinesIterator;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.mg4j.document.AbstractDocumentSequence;
import it.unimi.dsi.mg4j.document.CompositeDocumentSequence;
import it.unimi.dsi.mg4j.document.DateArrayDocumentCollection;
import it.unimi.dsi.mg4j.document.Document;
import it.unimi.dsi.mg4j.document.DocumentCollection;
import it.unimi.dsi.mg4j.document.DocumentFactory;
import it.unimi.dsi.mg4j.document.DocumentIterator;
import it.unimi.dsi.mg4j.document.DocumentSequence;
import it.unimi.dsi.mg4j.document.IdentityDocumentFactory;
import it.unimi.dsi.mg4j.document.InputStreamDocumentSequence;
import it.unimi.dsi.mg4j.document.IntArrayDocumentCollection;
import it.unimi.dsi.mg4j.document.MapVirtualDocumentCollection;
import it.unimi.dsi.mg4j.document.SimpleCompressedDocumentCollectionBuilder;
import it.unimi.dsi.mg4j.document.StringArrayDocumentCollection;
import it.unimi.dsi.mg4j.document.ZipDocumentCollectionBuilder;
import it.unimi.dsi.mg4j.document.DocumentFactory.FieldType;
import it.unimi.dsi.mg4j.index.CompressionFlags;
import it.unimi.dsi.mg4j.index.DiskBasedIndex;
import it.unimi.dsi.mg4j.index.DowncaseTermProcessor;
import it.unimi.dsi.mg4j.index.Index;
import it.unimi.dsi.mg4j.index.IndexIterator;
import it.unimi.dsi.mg4j.index.IndexReader;
import it.unimi.dsi.mg4j.index.TermProcessor;
import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
import it.unimi.dsi.mg4j.index.CompressionFlags.Component;
import it.unimi.dsi.mg4j.index.cluster.DocumentalPartitioningStrategy;
import it.unimi.dsi.mg4j.index.cluster.DocumentalStrategies;
import it.unimi.dsi.mg4j.index.cluster.IndexCluster;
import it.unimi.dsi.mg4j.index.cluster.LexicalPartitioningStrategy;
import it.unimi.dsi.mg4j.index.cluster.LexicalStrategies;
import it.unimi.dsi.mg4j.tool.Scan.VirtualDocumentFragment;
import it.unimi.dsi.sux4j.mph.MWHCFunction;
import it.unimi.dsi.util.Properties;
import it.unimi.dsi.util.ShiftAddXorSignedStringMap;
import it.unimi.dsi.util.StringMap;

import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.Reader;
import java.lang.reflect.InvocationTargetException;
import java.net.URISyntaxException;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.Map;

import junit.framework.TestCase;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.filefilter.FileFilterUtils;
import org.apache.log4j.Level;

public class IndexTest extends TestCase {
  static {
    Util.ensureLog4JIsConfigured( Level.INFO );
  }

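  /** Builds a signed string map over the given term file; the tests below store such maps next to partitioned local indices so that they can be opened and queried by term. */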
  private static StringMap<? extends CharSequence> createMap( String basename ) throws IOException {
    FileLinesCollection flc = new FileLinesCollection( basename, "UTF-8" );
    return new ShiftAddXorSignedStringMap( flc.iterator(), new MWHCFunction<CharSequence>( flc, TransformationStrategies.utf16() ) );
  }
 
 
  private String basename;

  private final int NUMBER_OF_DOCUMENTS = 100;

  private final int[] INTEGER_DOCUMENT = new int[ NUMBER_OF_DOCUMENTS ];

  private final Date[] DATE_DOCUMENT = new Date[ NUMBER_OF_DOCUMENTS ];

  @SuppressWarnings("unchecked")
  private final Int2ObjectMap<String>[] VIRTUAL_DOCUMENT = new Int2ObjectMap[ NUMBER_OF_DOCUMENTS ];
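  // Instance initializer filling the synthetic integer, date and virtual (link) test documents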
  {
    for ( int i = INTEGER_DOCUMENT.length; i-- != 0; )
      INTEGER_DOCUMENT[ i ] = i;
    for ( int i = DATE_DOCUMENT.length; i-- != 0; )
      DATE_DOCUMENT[ i ] = new Date( i * 86400000L );
    for ( int i = VIRTUAL_DOCUMENT.length; i-- != 0; ) {
      VIRTUAL_DOCUMENT[ i ] = new Int2ObjectArrayMap<String>();
      VIRTUAL_DOCUMENT[ i ].put( i - 1, "link _ to previous document link" );
      VIRTUAL_DOCUMENT[ i ].put( i, "link to this document link" );
      VIRTUAL_DOCUMENT[ i ].put( i + 1, "link to next document link" );
    }
  }

  private final VirtualDocumentResolver RESOLVER = new MapVirtualDocumentCollection.TrivialVirtualDocumentResolver( NUMBER_OF_DOCUMENTS );

  private static Reference2ObjectOpenHashMap<Component, Coding> defaultStandardIndex() {
    return new Reference2ObjectOpenHashMap<Component, Coding>( CompressionFlags.DEFAULT_STANDARD_INDEX );
  }
 
  public final static TermProcessor KILL_A_PROCESSOR = KillATermProcessor.getInstance();

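  /** A {@link TermProcessor} that accepts only terms not containing the letter <code>a</code>; it is used to check indexing with a processor that actually discards terms. */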
  public final static class KillATermProcessor implements TermProcessor {
    private static final long serialVersionUID = 1L;

    private static final KillATermProcessor INSTANCE = new KillATermProcessor();

    public TermProcessor copy() {
      return this;
    }

    public static TermProcessor getInstance() {
      return INSTANCE;
    }

    public boolean processPrefix( MutableString prefix ) {
      return true;
    }

    public boolean processTerm( MutableString term ) {
      return term.indexOf( 'a' ) == -1;
    }
  };

  final static int[] INDEXED_FIELD = { 0, 1, 2 };

  /**
   * Checks that the two provided indices are byte-by-byte the same, and that property files
   * coincide except for the provided property keys.
   *
   * @param basename0 the basename of an index.
   * @param basename1 the basename of an index.
   * @param excludedProperty a list of property keys that will not be considered when evaluating
   * the equality of the property files.
   */
  private void sameIndex( final String basename0, final String basename1, final String... excludedProperty ) throws IOException, ConfigurationException {
    // The two indices must be byte-by-byte identical in all components
    for ( String ext : new String[] { INDEX_EXTENSION, OFFSETS_EXTENSION, TERMS_EXTENSION, SIZES_EXTENSION, FREQUENCIES_EXTENSION, GLOBCOUNTS_EXTENSION } ) {
      File f0 = new File( basename0 + ext );
      File f1 = new File( basename1 + ext );
      assertEquals( ext, f0.exists(), f1.exists() );
      if ( ext != SIZES_EXTENSION && f0.exists() ) assertTrue( ext, IOUtils.contentEquals( new FileInputStream( f0 ), new FileInputStream( f1 ) ) );
    }

    Properties properties0 = new Properties( basename0 + PROPERTIES_EXTENSION );
    Properties properties1 = new Properties( basename1 + PROPERTIES_EXTENSION );
    for ( String p : excludedProperty ) {
      properties0.setProperty( p, null );
      properties1.setProperty( p, null );
    }

    assertEquals( properties0, properties1 );
  }

  public void sameContent( CharSequence basename0, CharSequence basename1, FileLinesIterator terms ) throws ConfigurationException, SecurityException, IOException, URISyntaxException,
      ClassNotFoundException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {
    sameContent( it.unimi.dsi.mg4j.index.Index.getInstance( basename0 ), it.unimi.dsi.mg4j.index.Index.getInstance( basename1 ), terms );
  }

  public void sameContent( CharSequence basename0, CharSequence basename1 ) throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException,
      InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {
    sameContent( basename0, basename1, null );
  }

  public void sameContent( it.unimi.dsi.mg4j.index.Index index0, it.unimi.dsi.mg4j.index.Index index1 ) throws IOException {
    sameContent( index0, index1, null );
  }

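  /**
   * Checks that the two given indices have the same content (document pointers, counts and, if present,
   * positions), scanning terms either by index or, if <code>terms</code> is not <code>null</code>, by the
   * provided term list.
   */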
  public void sameContent( it.unimi.dsi.mg4j.index.Index index0, it.unimi.dsi.mg4j.index.Index index1, final FileLinesIterator terms ) throws IOException {
    assertEquals( index0.hasCounts, index1.hasCounts );
    assertEquals( index0.hasPositions, index1.hasPositions );
    assertEquals( index0.hasPayloads, index1.hasPayloads );
    assertEquals( index0.numberOfTerms, index1.numberOfTerms );
    assertEquals( index0.numberOfDocuments, index1.numberOfDocuments );

    final int numTerms = index0.numberOfTerms;
    int document;
    int[] p0 = IntArrays.EMPTY_ARRAY, p1 = IntArrays.EMPTY_ARRAY;
    boolean hasCounts = index0.hasCounts, hasPositions = index0.hasPositions;
    final IndexReader reader0 = index0.getReader(), reader1 = index1.getReader();
    IndexIterator i0, i1;
    for ( int i = 0; i < numTerms; i++ ) {
      if ( terms != null ) {
        final CharSequence term = terms.next();
        i0 = reader0.documents( term );
        i1 = reader1.documents( term );
      }
      else {
        i0 = reader0.documents( i );
        i1 = reader1.documents( i );
      }

      while ( i0.hasNext() && i1.hasNext() ) {
        assertEquals( "term " + i, document = i0.nextDocument(), i1.nextDocument() );
        if ( hasCounts ) {
          assertEquals( "term " + i + ", document " + document, i0.count(), i1.count() );
          if ( i0.count() > p0.length ) p0 = new int[ i0.count() ];
          if ( i1.count() > p1.length ) p1 = new int[ i1.count() ];
          if ( hasPositions ) {
            // read the positions of the current document into the (pre-sized) arrays before comparing them
            i0.positions( p0 );
            i1.positions( p1 );
            for ( int p = i0.count(); p-- != 0; )
              assertEquals( "term " + i + ", document " + document + ", position " + p, p0[ p ], p1[ p ] );
          }
        }
      }

      assertEquals( "term " + i, i0.hasNext(), i1.hasNext() );
    }
    reader0.close();
    reader1.close();
  }


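  /**
   * Simulates the indexing of a single (possibly virtual) document: the words returned by <code>wordReader</code>
   * are filtered through <code>termProcessor</code>, and for each surviving term a posting of the form
   * [document, position, position, ...] is appended to <code>termMap</code>, positions starting at
   * <code>startPos</code>.
   *
   * @return the position following the last processed word.
   */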
  public int processDocument( WordReader wordReader, int documentIndex, int startPos, Object2ObjectOpenHashMap<MutableString, ObjectArrayList<int[]>> termMap, TermProcessor termProcessor )
      throws IOException {
    assertTrue( documentIndex >= 0 );
    Object2ObjectOpenHashMap<MutableString, IntArrayList> terms = new Object2ObjectOpenHashMap<MutableString, IntArrayList>();
    MutableString word = new MutableString(), nonWord = new MutableString();

    int pos = startPos;
    while ( wordReader.next( word, nonWord ) ) {
      if ( word.length() == 0 ) continue;
      if ( !termProcessor.processTerm( word ) ) {
        pos++;
        continue;
      }
      IntArrayList positions = terms.get( word );
      if ( positions == null ) terms.put( word.copy(), positions = new IntArrayList() );
      positions.add( pos++ );
    }

    for ( MutableString term : terms.keySet() ) {
      ObjectArrayList<int[]> list = termMap.get( term );
      IntArrayList positions = terms.get( term );
      if ( list == null ) termMap.put( term, list = new ObjectArrayList<int[]>() );

      int[] t = new int[ positions.size() + 1 ];
      t[ 0 ] = documentIndex;
      System.arraycopy( positions.elements(), 0, t, 1, positions.size() );
      list.add( t );
    }

    return pos;
  }

  /**
   * Checks that the fields indexed by the given indices have been indexed correctly by performing
   * a mock index construction over the given sequence.
   *
   * @param sequence a document sequence.
   * @param map a remapping of document indices (the index under which each document is actually indexed), or
   * <code>null</code> for the identity.
   * @param resolver the virtual document resolver used to index the collection (we assume the
   * same for all virtual fields), or <code>null</code>.
   * @param gap the virtual document gap (we assume the same for all virtual fields; it is
   * immaterial if no field is virtual).
   * @param index a list of indices that have indexed one or more fields of <code>sequence</code>.
   */
  @SuppressWarnings("unchecked")
  public void checkAgainstContent( DocumentSequence sequence, int[] map, VirtualDocumentResolver resolver, int gap, Index... index ) throws IOException {
    DocumentIterator iterator = sequence.iterator();
    DocumentFactory factory = sequence.factory();
    Document document;
    final int n = index.length;
    final int[] field = new int[ n ];
    final int[][] currMaxPos = new int[ n ][];
    final int[] maxDoc = new int[ n ];
    IntArrays.fill( maxDoc, -1 );
    final Object2ObjectOpenHashMap<MutableString, ObjectArrayList<int[]>>[] termMap = new Object2ObjectOpenHashMap[ n ];
    final IntArrayList[] payloadPointers = new IntArrayList[ n ];
    final ObjectArrayList<Object>[] payloadContent = new ObjectArrayList[ n ];

    for ( int i = 0; i < n; i++ ) {
      field[ i ] = factory.fieldIndex( index[ i ].field );
      switch ( factory.fieldType( field[ i ] ) ) {
      case VIRTUAL:
        currMaxPos[ i ] = new int[ resolver.numberOfDocuments() ];
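        // intentional fall-through: virtual fields need a term map, too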
      case TEXT:
        termMap[ i ] = new Object2ObjectOpenHashMap<MutableString, ObjectArrayList<int[]>>();
        break;
      case DATE:
      case INT:
        payloadPointers[ i ] = new IntArrayList();
        payloadContent[ i ] = new ObjectArrayList<Object>();
      }
    }

    int documentIndex = 0;

    while ( ( document = iterator.nextDocument() ) != null ) {
      for ( int i = 0; i < field.length; i++ ) {
        switch ( factory.fieldType( field[ i ] ) ) {
        case TEXT:
          processDocument( document.wordReader( field[ i ] ).setReader( (Reader)document.content( field[ i ] ) ), map == null ? documentIndex : map[ documentIndex ], 0, termMap[ i ],
              index[ i ].termProcessor );
          break;
        case VIRTUAL:
          ObjectArrayList<VirtualDocumentFragment> fragments = (ObjectArrayList<VirtualDocumentFragment>)document.content( field[ i ] );
          resolver.context( document );
          for ( VirtualDocumentFragment fragment : fragments ) {
            int d = resolver.resolve( fragment.documentSpecifier() );

            if ( d != -1 ) {
              if ( map != null ) d = map[ d ];
              if ( maxDoc[ i ] < d ) maxDoc[ i ] = d;
              currMaxPos[ i ][ d ] = processDocument( document.wordReader( field[ i ] ).setReader( new FastBufferedReader( fragment.text() ) ), d, currMaxPos[ i ][ d ], termMap[ i ],
                  index[ i ].termProcessor )
                  + gap;
            }
          }
          break;
        case INT:
        case DATE:
          Object x = document.content( field[ i ] );
          if ( x != null ) {
            payloadPointers[ i ].add( map == null ? documentIndex : map[ documentIndex ] );
            payloadContent[ i ].add( x );
          }
        default:
        }
      }
      document.close();
      documentIndex++;
    }

    iterator.close();

    for ( int i = 0; i < n; i++ ) {
      if ( termMap[ i ] != null ) for ( ObjectArrayList<int[]> list : termMap[ i ].values() ) {
        // We sort in all cases, just to reduce the possible execution paths
        Collections.sort( list, new Comparator<int[]>() {
          public int compare( int[] p0, int[] p1 ) {
            return p0[ 0 ] - p1[ 0 ];
          }
        } );

        switch ( factory.fieldType( field[ i ] ) ) {
        case VIRTUAL:
          // We coalesce the list
          ObjectArrayList<int[]> newList = new ObjectArrayList<int[]>();
          for ( int k = 0; k < list.size(); ) {
            int s;
            for ( s = k + 1; s < list.size(); s++ )
              if ( list.get( k )[ 0 ] != list.get( s )[ 0 ] ) break;
            int count = 0;
            for ( int t = k; t < s; t++ )
              count += list.get( t ).length - 1;
            int[] posting = new int[ count + 1 ];
            posting[ 0 ] = list.get( k )[ 0 ];
            count = 1;
            for ( int t = k; t < s; t++ ) {
              System.arraycopy( list.get( t ), 1, posting, count, list.get( t ).length - 1 );
              count += list.get( t ).length - 1;
            }
            k = s;
            newList.add( posting );
          }
          list.clear();
          list.addAll( newList );
          break;
        default:
        }
      }
      if ( payloadPointers[ i ] != null ) {
        final int p[] = payloadPointers[ i ].elements();
        final Object[] b = payloadContent[ i ].elements();
        Arrays.quickSort( 0, payloadPointers[ i ].size(), new AbstractIntComparator() {
          public int compare( int i0, int i1 ) {
            return p[ i0 ] - p[ i1 ];
          }
        }, new Swapper() {
          public void swap( int i0, int i1 ) {
            final int t = p[ i0 ];
            p[ i0 ] = p[ i1 ];
            p[ i1 ] = t;
            final Object o = b[ i0 ];
            b[ i0 ] = b[ i1 ];
            b[ i1 ] = o;
          }
        } );
      }
    }


    for ( int i = 0; i < n; i++ ) {
      assertEquals( index[ i ].toString(), factory.fieldType( field[ i ] ) == FieldType.VIRTUAL ? maxDoc[ i ] + 1 : documentIndex, index[ i ].numberOfDocuments );
      switch ( factory.fieldType( field[ i ] ) ) {
      case TEXT:
      case VIRTUAL:
        assertEquals( termMap[ i ].size(), index[ i ].numberOfTerms );
        int postings = 0, occurrences = 0;
        for ( ObjectArrayList<int[]> l : termMap[ i ].values() ) {
          postings += l.size();
          for ( int[] p : l )
            occurrences += p.length - 1;
        }
        assertEquals( index[ i ].toString(), postings, index[ i ].numberOfPostings );
        assertEquals( occurrences, index[ i ].numberOfOccurrences );
        IndexReader indexReader = index[ i ].getReader();
        for ( MutableString term : new ObjectRBTreeSet<MutableString>( termMap[ i ].keySet() ).toArray( new MutableString[ termMap[ i ].size() ] ) ) {
          String msg = index[ i ] + ":" + term;
          IndexIterator indexIterator = indexReader.documents( term );
          ObjectArrayList<int[]> list = termMap[ i ].get( term );
          int k = 0;
          while ( indexIterator.hasNext() ) {
            assertEquals( msg, list.get( k )[ 0 ], indexIterator.nextDocument() ); // Document pointer
            if ( index[ i ].hasCounts ) assertEquals( msg, list.get( k ).length - 1, indexIterator.count() ); // Count
            if ( index[ i ].hasPositions ) {
              final int[] position = indexIterator.positionArray();
              for ( int p = 0; p < indexIterator.count(); p++ )
                assertEquals( msg, list.get( k )[ p + 1 ], position[ p ] ); // Positions
            }
            k++;
          }
          assertEquals( k, list.size() ); // This implicitly checks the frequency
        }
        indexReader.close();
        break;
      case INT:
      case DATE:
        assertEquals( index[ i ].toString(), payloadPointers[ i ].size(), index[ i ].numberOfPostings );
        assertEquals( index[ i ].toString(), documentIndex != 0 ? 1 : 0, index[ i ].numberOfTerms );
        assertEquals( index[ i ].toString(), -1, index[ i ].numberOfOccurrences );
        if ( documentIndex != 0 ) {
          IndexIterator indexIterator = index[ i ].documents( 0 );
          int k = 0;
          while ( indexIterator.hasNext() ) {
            assertEquals( payloadPointers[ i ].getInt( k ), indexIterator.nextDocument() );
            if ( factory.fieldType( field[ i ] ) == FieldType.INT )
              assertEquals( ( (Number)payloadContent[ i ].get( k ) ).longValue(), ( (Number)indexIterator.payload().get() ).longValue() );
            else assertEquals( payloadContent[ i ].get( k ), indexIterator.payload().get() );
            k++;
          }
          indexIterator.dispose();
          assertEquals( k, payloadContent[ i ].size() );
        }
      }
    }
  }

  public void setUp() throws IOException {
    basename = File.createTempFile( this.getClass().getSimpleName(), "test" ).getCanonicalPath();
  }

  public void tearDown() throws IOException {
    for ( Object f : FileUtils.listFiles( new File( basename ).getParentFile(), FileFilterUtils.prefixFileFilter( this.getClass().getSimpleName() ), null ) )
      ( (File)f ).delete();
    if ( lastSequence != null ) lastSequence.close();
  }

  // We keep track of the last returned sequence to close it without cluttering the test code
  private DocumentSequence lastSequence;

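  /**
   * Returns a new composite test sequence: text documents read from the <code>documents.data</code> resource,
   * plus parallel integer, date and virtual (link) fields.
   */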
  public DocumentSequence getSequence() throws ConfigurationException, IOException {
    if ( lastSequence != null ) lastSequence.close();
    return lastSequence = new CompositeDocumentSequence( new InputStreamDocumentSequence( this.getClass().getResourceAsStream( "documents.data" ), 10, new IdentityDocumentFactory(
        new String[] { "encoding=UTF-8" } ), NUMBER_OF_DOCUMENTS ), new IntArrayDocumentCollection( INTEGER_DOCUMENT ), new DateArrayDocumentCollection( DATE_DOCUMENT ),
        new MapVirtualDocumentCollection( VIRTUAL_DOCUMENT ) );
  }

  @SuppressWarnings("unchecked")
  public DocumentSequence getEmptySequence() throws ConfigurationException, IOException {
    if ( lastSequence != null ) lastSequence.close();
    return lastSequence = new CompositeDocumentSequence( new StringArrayDocumentCollection(), new IntArrayDocumentCollection(), new DateArrayDocumentCollection(),
        new MapVirtualDocumentCollection() );
  }

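  /**
   * Builds the test indices with the given parameters and checks them against the document sequence; then
   * rebuilds them through a zipped collection, a simple compressed collection and a single-batch run, checking
   * that the results are byte-by-byte identical (or just content-equivalent when a negative quantum makes the
   * skip structure depend on the division in batches).
   */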
  public void testIndex( boolean interleaved, Map<Component, Coding> flags, int quantum, int height, TermProcessor termProcessor ) throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException,
      InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {

    // Vanilla indexing
    new IndexBuilder( basename, getSequence() ).standardWriterFlags( flags ).termProcessor( termProcessor ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum )
        .height( height ).virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( 20 ).run();
    checkAgainstContent( getSequence(), null, RESOLVER, Scan.DEFAULT_VIRTUAL_DOCUMENT_GAP, Index.getInstance( basename + "-text" ), Index.getInstance( basename + "-int" ), Index
        .getInstance( basename + "-date" ), Index.getInstance( basename + "-virtual" ) );

    final String basenameZipped = basename + "-zipped";
    if ( interleaved && flags.get( Component.POSITIONS ) != null ) flags.put( Component.POSITIONS, Coding.GOLOMB );
    // Vanilla indexing generating a zipped collection (we also use Golomb coding to test the usage of sizes in combinations).
    ZipDocumentCollectionBuilder zipBuilder = new ZipDocumentCollectionBuilder( basenameZipped, getSequence().factory(), true );
    new IndexBuilder( basename, getSequence() ).standardWriterFlags( flags ).termProcessor( termProcessor ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum )
        .height( height ).virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( 20 ).builder( zipBuilder ).run();
    // Vanilla indexing using the zipped collection
    new IndexBuilder( basenameZipped, AbstractDocumentSequence.load( basenameZipped + DocumentCollection.DEFAULT_EXTENSION ) ).standardWriterFlags( flags ).termProcessor( termProcessor ).indexedFields( 0, 1, 2, 3 ).skipBufferSize( 1024 )
        .pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum ).height( height ).virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( 20 ).run();

    // The two indices must be byte-by-byte identical (and we keep the zipped index for future
    // reference)
    sameIndex( basename + "-text", basenameZipped + "-text" );
    sameIndex( basename + "-int", basenameZipped + "-int", "batches" );
    sameIndex( basename + "-date", basenameZipped + "-date", "batches" );
    sameIndex( basename + "-virtual", basenameZipped + "-virtual", "batches" );

    final String basenameSimple = basename + "-simple";

    // Vanilla indexing generating a simple compressed collection
    SimpleCompressedDocumentCollectionBuilder simpleBuilder = new SimpleCompressedDocumentCollectionBuilder( basenameSimple, getSequence().factory(), true );
    new IndexBuilder( basename, getSequence() ).standardWriterFlags( flags ).termProcessor( termProcessor ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum )
        .height( height ).virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( 20 ).builder( simpleBuilder ).run();
    // Vanilla indexing using the simple compressed collection
    new IndexBuilder( basenameSimple, AbstractDocumentSequence.load( basenameSimple + DocumentCollection.DEFAULT_EXTENSION ) ).standardWriterFlags( flags ).termProcessor( termProcessor ).indexedFields( 0, 1, 2, 3 ).skipBufferSize( 1024 )
        .pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum ).height( height ).virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( 20 ).run();

    // The two indices must be byte-by-byte identical (and we keep the simple compressed index for future
    // reference)
    sameIndex( basename + "-text", basenameSimple + "-text" );
    sameIndex( basename + "-int", basenameSimple + "-int", "batches" );
    sameIndex( basename + "-date", basenameSimple + "-date", "batches" );
    sameIndex( basename + "-virtual", basenameSimple + "-virtual", "batches" );


    // Indexing with just one batch
    new IndexBuilder( basename + "-onebatch", getSequence() ).standardWriterFlags( flags ).termProcessor( termProcessor ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 )
        .quantum( quantum ).height( height ).virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( NUMBER_OF_DOCUMENTS ).run();

    if ( quantum >= 0 ) {
      // The two indices must be byte-by-byte identical
      sameIndex( basename + "-text", basename + "-onebatch-text", "batches" );
      sameIndex( basename + "-int", basename + "-onebatch-int", "batches" );
      sameIndex( basename + "-date", basename + "-onebatch-date", "batches" );
      sameIndex( basename + "-virtual", basename + "-onebatch-virtual", "batches" );
    }
    else {
      // The two indices must have the same content, as a different division
      // in batches can lead to a different quantum estimate.
      sameContent( basename + "-text", basename + "-onebatch-text" );
      sameContent( basename + "-int", basename + "-onebatch-int" );
      sameContent( basename + "-date", basename + "-onebatch-date" );
      sameContent( basename + "-virtual", basename + "-onebatch-virtual" );
    }
  }

  public void testIndex( boolean interleaved, int quantum, int height ) throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException,
      IllegalAccessException, InvocationTargetException, NoSuchMethodException {
    testIndex( interleaved, defaultStandardIndex(), quantum, height, DowncaseTermProcessor.getInstance() );
  }

  public void testIndex( boolean interleaved, Map<Component, Coding> flags, int quantum, int height ) throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {
    testIndex( interleaved, flags, quantum, height, DowncaseTermProcessor.getInstance() );
  }
 
  public void testIndex() throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException, IllegalAccessException,
      InvocationTargetException, NoSuchMethodException {

    final Reference2ObjectOpenHashMap<Component, Coding> flags = new Reference2ObjectOpenHashMap<Component, Coding>( defaultStandardIndex() );
    flags.remove( Component.POSITIONS );
    testIndex( true, flags, 4, 4 );
    testIndex( true, flags, -4, 4 );
    flags.remove( Component.COUNTS );
    testIndex( true, flags, 4, 4 );
    testIndex( true, flags, -4, 4 );

   
    testIndex( true, 0, 0 );
    testIndex( true, defaultStandardIndex(), 0, 0, KILL_A_PROCESSOR );
    testIndex( true, 1, 1 );
    testIndex( true, 1, 2 );
    testIndex( true, 4, 1 );
    testIndex( true, 4, 4 );
    testIndex( true, 8, 1 );
    testIndex( true, 8, 4 );
    testIndex( true, -1, 1 );
    testIndex( true, -1, 2 );
    testIndex( true, -4, 1 );
    testIndex( true, -4, 4 );
    testIndex( true, -8, 1 );
    testIndex( true, -8, 4 );

    testIndex( false, 1, 0 );
    testIndex( false, defaultStandardIndex(), 1, 0, KILL_A_PROCESSOR );
    testIndex( false, 1, 1 );
    testIndex( false, 1, 2 );
    testIndex( false, 4, 1 );
    testIndex( false, 4, 4 );
    testIndex( false, 8, 1 );
    testIndex( false, 8, 4 );
    testIndex( false, -1, 1 );
    testIndex( false, -1, 2 );
    testIndex( false, -4, 1 );
    testIndex( false, -4, 4 );
    testIndex( false, -8, 1 );
    testIndex( false, -8, 4 );
  }

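  /**
   * Builds the test indices remapping document indices through the permutation stored in the
   * <code>documents.permutation.data</code> resource, checks them against the (remapped) content, and compares
   * a single-batch remapped build against the multi-batch one.
   */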
  public void testRemappedIndex( boolean interleaved, Map<Component, Coding> flags, int quantum, int height, TermProcessor termProcessor ) throws IOException, ConfigurationException, SecurityException, URISyntaxException, ClassNotFoundException,
      InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {

    final String basenameMapped = basename + "-map";
    int[] map = IntIterators.unwrap( BinIO.asIntIterator( new DataInputStream( this.getClass().getResourceAsStream( "documents.permutation.data" ) ) ) );
    String mapFile = File.createTempFile( this.getClass().getSimpleName(), "map" ).toString();
    BinIO.storeInts( map, mapFile );

    // Remapped index
    new IndexBuilder( basenameMapped, getSequence() ).standardWriterFlags( flags ).termProcessor( termProcessor ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum(
        quantum ).height( height ).virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( 20 ).mapFile( mapFile ).run();
    checkAgainstContent( getSequence(), map, RESOLVER, Scan.DEFAULT_VIRTUAL_DOCUMENT_GAP, Index.getInstance( basenameMapped + "-text" ), Index.getInstance( basenameMapped + "-int" ), Index
        .getInstance( basenameMapped + "-date" ), Index.getInstance( basenameMapped + "-virtual" ) );

    // Remapped index, one batch
    new IndexBuilder( basenameMapped + "-onebatch", getSequence() ).standardWriterFlags( flags ).termProcessor( termProcessor ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 )
        .quantum( quantum ).height( height ).virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( NUMBER_OF_DOCUMENTS ).mapFile( mapFile ).run();

    if ( quantum >= 0 ) {
      // The two indices must be byte-by-byte identical
      sameIndex( basenameMapped + "-text", basenameMapped + "-onebatch-text", "batches" );
      sameIndex( basenameMapped + "-int", basenameMapped + "-onebatch-int", "batches" );
      sameIndex( basenameMapped + "-date", basenameMapped + "-onebatch-date", "batches" );
      sameIndex( basenameMapped + "-virtual", basenameMapped + "-onebatch-virtual", "batches" );
    }
    else {
      // The two indices must have the same content, as a different division
      // in batches can lead to a different quantum estimate.
      sameContent( basenameMapped + "-text", basenameMapped + "-onebatch-text" );
      sameContent( basenameMapped + "-int", basenameMapped + "-onebatch-int" );
      sameContent( basenameMapped + "-date", basenameMapped + "-onebatch-date" );
      sameContent( basenameMapped + "-virtual", basenameMapped + "-onebatch-virtual" );
    }
  }

  public void testRemappedIndex( boolean interleaved, int quantum, int height ) throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException,
      IllegalAccessException, InvocationTargetException, NoSuchMethodException {
    testRemappedIndex( interleaved, defaultStandardIndex(), quantum, height, DowncaseTermProcessor.getInstance() );
  }
  public void testRemappedIndex( boolean interleaved, Map<Component, Coding> flags, int quantum, int height ) throws IOException, ConfigurationException, SecurityException, URISyntaxException, ClassNotFoundException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {
    testRemappedIndex( interleaved, flags, quantum, height, DowncaseTermProcessor.getInstance() );
  }
 
  public void testRemappedIndex() throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException, IllegalAccessException,
      InvocationTargetException, NoSuchMethodException {

    final Reference2ObjectOpenHashMap<Component, Coding> flags = new Reference2ObjectOpenHashMap<Component, Coding>( defaultStandardIndex() );
    flags.remove( Component.POSITIONS );
    testRemappedIndex( true, flags, 4, 4 );
    testRemappedIndex( true, flags, -4, 4 );
    flags.remove( Component.COUNTS );
    testRemappedIndex( true, flags, 4, 4 );
    testRemappedIndex( true, flags, -4, 4 );

   
    testRemappedIndex( true, 0, 0 );
    testRemappedIndex( true, defaultStandardIndex(), 0, 0, KILL_A_PROCESSOR );
    testRemappedIndex( true, 1, 1 );
    testRemappedIndex( true, 1, 2 );
    testRemappedIndex( true, 4, 1 );
    testRemappedIndex( true, 4, 4 );
    testRemappedIndex( true, 8, 1 );
    testRemappedIndex( true, 8, 4 );
    testRemappedIndex( true, -1, 1 );
    testRemappedIndex( true, -1, 2 );
    testRemappedIndex( true, -4, 1 );
    testRemappedIndex( true, -4, 4 );
    testRemappedIndex( true, -8, 1 );
    testRemappedIndex( true, -8, 4 );

    testRemappedIndex( false, 1, 0 );
    testRemappedIndex( false, defaultStandardIndex(), 1, 0, KILL_A_PROCESSOR );
    testRemappedIndex( false, 1, 1 );
    testRemappedIndex( false, 1, 2 );
    testRemappedIndex( false, 4, 1 );
    testRemappedIndex( false, 4, 4 );
    testRemappedIndex( false, 8, 1 );
    testRemappedIndex( false, 8, 4 );
    testRemappedIndex( false, -1, 1 );
    testRemappedIndex( false, -1, 2 );
    testRemappedIndex( false, -4, 1 );
    testRemappedIndex( false, -4, 4 );
    testRemappedIndex( false, -8, 1 );
    testRemappedIndex( false, -8, 4 );
  }

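  /**
   * Builds the test indices, partitions them documentally into three uniform parts, checks the resulting
   * clusters against the original indices, and finally concatenates the parts back, checking the result
   * against the original indices.
   */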
  public void testPartitionConcatenate( boolean interleaved, Map<Component, Coding> flags, int quantum, int height ) throws Exception {
    // Vanilla indexing
    if ( interleaved && flags.get( Component.POSITIONS ) != null ) flags.put( Component.POSITIONS, Coding.GOLOMB );
    new IndexBuilder( basename, getSequence() ).standardWriterFlags( flags ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum ).height( height )
        .virtualDocumentResolver( 3, RESOLVER ).run();

    // We partition
    BinIO.storeObject( DocumentalStrategies.uniform( 3, NUMBER_OF_DOCUMENTS ), basename + "-strategy" );

    new PartitionDocumentally( basename + "-text", basename + "-text-part", DocumentalStrategies.uniform( 3, NUMBER_OF_DOCUMENTS ), basename + "-strategy", 0, 1024, flags,
        interleaved, quantum != 0, Math.abs( quantum ), height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
    new PartitionDocumentally( basename + "-int", basename + "-int-part", DocumentalStrategies.uniform( 3, NUMBER_OF_DOCUMENTS ), basename + "-strategy", 0, 1024, DEFAULT_PAYLOAD_INDEX,
        interleaved, quantum != 0, Math.abs( quantum ), height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
    new PartitionDocumentally( basename + "-date", basename + "-date-part", DocumentalStrategies.uniform( 3, NUMBER_OF_DOCUMENTS ), basename + "-strategy", 0, 1024, DEFAULT_PAYLOAD_INDEX,
        interleaved, quantum != 0, Math.abs( quantum ), height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
    new PartitionDocumentally( basename + "-virtual", basename + "-virtual-part", DocumentalStrategies.uniform( 3, NUMBER_OF_DOCUMENTS ), basename + "-strategy", 0, 1024, flags,
        interleaved, quantum != 0, Math.abs( quantum ), height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();

    // For the text part, we need term maps so that the local indices can be looked up by term in sameContent()
    String[] localIndex = new Properties( basename + "-text-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
    for ( String index : localIndex ) BinIO.storeObject( createMap( index + TERMS_EXTENSION ), index + TERMMAP_EXTENSION );

    sameContent( basename + "-text", basename + "-text-part", new FileLinesCollection( basename + "-text" + TERMS_EXTENSION, "UTF-8" ).iterator() );

    sameContent( basename + "-int", basename + "-int-part" );
    sameContent( basename + "-date", basename + "-date-part" );

    localIndex = new Properties( basename + "-virtual-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
    for ( String index : localIndex )
      BinIO.storeObject( createMap( index + TERMS_EXTENSION ), index + TERMMAP_EXTENSION );

    sameContent( basename + "-virtual", basename + "-virtual-part", new FileLinesCollection( basename + "-virtual" + TERMS_EXTENSION, "UTF-8" ).iterator() );

    localIndex = new Properties( basename + "-text-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
    new Concatenate( basename + "-text-merged", localIndex, false, 1024, flags, interleaved, quantum != 0, quantum, height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
    if ( quantum >= 0 ) sameIndex( basename + "-text", basename + "-text-merged", "batches", flags.containsKey( Component.COUNTS ) ? "" : "occurrences" );
    sameContent( basename + "-text", basename + "-text-merged" );

    localIndex = new Properties( basename + "-int-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
    new Concatenate( basename + "-int-merged", localIndex, false, 1024, DEFAULT_PAYLOAD_INDEX, interleaved, quantum != 0, quantum, height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
    if ( quantum >= 0 ) sameIndex( basename + "-int", basename + "-int-merged", "batches", flags.containsKey( Component.COUNTS ) ? "" : "occurrences" );
    sameContent( basename + "-int", basename + "-int-merged" );

    localIndex = new Properties( basename + "-date-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
    new Concatenate( basename + "-date-merged", localIndex, false, 1024, DEFAULT_PAYLOAD_INDEX, interleaved, quantum != 0, quantum, height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
    if ( quantum >= 0 ) sameIndex( basename + "-date", basename + "-date-merged", "batches", flags.containsKey( Component.COUNTS ) ? "" : "occurrences" );
    sameContent( basename + "-date", basename + "-date-merged" );

    localIndex = new Properties( basename + "-virtual-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
    new Concatenate( basename + "-virtual-merged", localIndex, false, 1024, flags, interleaved, quantum != 0, quantum, height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
    if ( quantum >= 0 ) sameIndex( basename + "-virtual", basename + "-virtual-merged", "batches", flags.containsKey( Component.COUNTS ) ? "" : "occurrences" );
    sameContent( basename + "-virtual", basename + "-virtual-merged" );
  }

  public void testPartitionConcatenate() throws Exception {

    final Reference2ObjectOpenHashMap<Component, Coding> flags = new Reference2ObjectOpenHashMap<Component, Coding>( defaultStandardIndex() );
    flags.remove( Component.POSITIONS );
    testPartitionConcatenate( true, flags, 4, 4 );
    testPartitionConcatenate( true, flags, -4, 4 );
    flags.remove( Component.COUNTS );
    testPartitionConcatenate( true, flags, 4, 4 );
    testPartitionConcatenate( true, flags, -4, 4 );

    testPartitionConcatenate( true, defaultStandardIndex(), 0, 0 );
    testPartitionConcatenate( true, defaultStandardIndex(), 1, 1 );
    testPartitionConcatenate( true, defaultStandardIndex(), 1, 2 );
    testPartitionConcatenate( true, defaultStandardIndex(), 4, 1 );
    testPartitionConcatenate( true, defaultStandardIndex(), 4, 4 );
    testPartitionConcatenate( true, defaultStandardIndex(), 8, 1 );
    testPartitionConcatenate( true, defaultStandardIndex(), 8, 4 );
    testPartitionConcatenate( true, defaultStandardIndex(), -1, 1 );
    testPartitionConcatenate( true, defaultStandardIndex(), -1, 2 );
    testPartitionConcatenate( true, defaultStandardIndex(), -4, 1 );
    testPartitionConcatenate( true, defaultStandardIndex(), -4, 4 );
    testPartitionConcatenate( true, defaultStandardIndex(), -8, 1 );
    testPartitionConcatenate( true, defaultStandardIndex(), -8, 4 );

    testPartitionConcatenate( false, defaultStandardIndex(), 1, 0 );
    testPartitionConcatenate( false, defaultStandardIndex(), 1, 1 );
    testPartitionConcatenate( false, defaultStandardIndex(), 1, 2 );
    testPartitionConcatenate( false, defaultStandardIndex(), 4, 1 );
    testPartitionConcatenate( false, defaultStandardIndex(), 4, 4 );
    testPartitionConcatenate( false, defaultStandardIndex(), 8, 1 );
    testPartitionConcatenate( false, defaultStandardIndex(), 8, 4 );
    testPartitionConcatenate( false, defaultStandardIndex(), -1, 1 );
    testPartitionConcatenate( false, defaultStandardIndex(), -1, 2 );
    testPartitionConcatenate( false, defaultStandardIndex(), -4, 1 );
    testPartitionConcatenate( false, defaultStandardIndex(), -4, 4 );
    testPartitionConcatenate( false, defaultStandardIndex(), -8, 1 );
    testPartitionConcatenate( false, defaultStandardIndex(), -8, 4 );
  }


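  /**
   * Builds the test indices, partitions them documentally using a modulo-3 strategy, checks the resulting
   * clusters against the original indices, and then merges the parts back, checking the result against the
   * original indices.
   */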
  public void testPartitionMerge( boolean interleaved, Map<Component, Coding> flags, int quantum, int height ) throws ConfigurationException, SecurityException, IOException, ClassNotFoundException, InstantiationException, IllegalAccessException,
      Exception {
   
    if ( interleaved && flags.get( Component.POSITIONS ) != null ) flags.put( Component.POSITIONS, Coding.GOLOMB );
   
    // Vanilla indexing
    new IndexBuilder( basename, getSequence() ).standardWriterFlags( flags ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum ).height( height )
        .virtualDocumentResolver( 3, RESOLVER ).run();

    // Now we use a crazy strategy moving around documents using modular arithmetic
    final DocumentalPartitioningStrategy modulo3 = new Modulo3DocumentalClusteringStrategy( NUMBER_OF_DOCUMENTS );
    BinIO.storeObject( modulo3, basename + "-strategy" );

    new PartitionDocumentally( basename + "-text", basename + "-text-part", modulo3, basename + "-strategy", 0, 1024, flags, interleaved, quantum != 0, Math.abs( quantum ), height,
        1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
    new PartitionDocumentally( basename + "-int", basename + "-int-part", modulo3, basename + "-strategy", 0, 1024, DEFAULT_PAYLOAD_INDEX, interleaved, quantum != 0, Math.abs( quantum ), height, 1024 * 1024,
        DEFAULT_LOG_INTERVAL ).run();
    new PartitionDocumentally( basename + "-date", basename + "-date-part", modulo3, basename + "-strategy", 0, 1024, DEFAULT_PAYLOAD_INDEX, interleaved, quantum != 0, Math.abs( quantum ), height,
        1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
    new PartitionDocumentally( basename + "-virtual", basename + "-virtual-part", modulo3, basename + "-strategy", 0, 1024, flags, interleaved, quantum != 0, Math.abs( quantum ), height,
        1024 * 1024, DEFAULT_LOG_INTERVAL ).run();

    String[] localIndex = new Properties( basename + "-text-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
    for ( String index : localIndex )
      BinIO.storeObject( createMap( index + TERMS_EXTENSION ), index + TERMMAP_EXTENSION );
    sameContent( basename + "-text", basename + "-text-part", new FileLinesCollection( basename + "-text" + TERMS_EXTENSION, "UTF-8" ).iterator() );

    sameContent( basename + "-int", basename + "-int-part" );
    sameContent( basename + "-date", basename + "-date-part" );

    localIndex = new Properties( basename + "-virtual-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
    for ( String index : localIndex )
      BinIO.storeObject( createMap( index + TERMS_EXTENSION ), index + TERMMAP_EXTENSION );
    sameContent( basename + "-virtual", basename + "-virtual-part", new FileLinesCollection( basename + "-virtual" + TERMS_EXTENSION, "UTF-8" ).iterator() );

    localIndex = new Properties( basename + "-text-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );

    new Merge( basename + "-text-merged", localIndex, false, 1024, flags, interleaved, quantum != 0, quantum, height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
    if ( ! interleaved && quantum >= 0 ) sameIndex( basename + "-text", basename + "-text-merged", "batches" );
    else sameContent( basename + "-text", basename + "-text-merged" );
    localIndex = new Properties( basename + "-int-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
    new Merge( basename + "-int-merged", localIndex, false, 1024, DEFAULT_PAYLOAD_INDEX, interleaved, quantum != 0, quantum, height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
    if ( ! interleaved && quantum >= 0 ) sameIndex( basename + "-int", basename + "-int-merged", "batches" );
    else sameContent( basename + "-int", basename + "-int-merged" );
    localIndex = new Properties( basename + "-date-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
    new Merge( basename + "-date-merged", localIndex, false, 1024, DEFAULT_PAYLOAD_INDEX, interleaved, quantum != 0, quantum, height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
    if ( ! interleaved && quantum >= 0 ) sameIndex( basename + "-date", basename + "-date-merged", "batches" );
    else sameContent( basename + "-date", basename + "-date-merged" );
    localIndex = new Properties( basename + "-virtual-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
    new Merge( basename + "-virtual-merged", localIndex, false, 1024, flags, interleaved, quantum != 0, quantum, height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
    if ( ! interleaved && quantum >= 0 ) sameIndex( basename + "-virtual", basename + "-virtual-merged", "batches" );
    else sameContent( basename + "-virtual", basename + "-virtual-merged" );
  }

  public void testPartitionMerge() throws Exception {
    final Reference2ObjectOpenHashMap<Component, Coding> flags = new Reference2ObjectOpenHashMap<Component, Coding>( defaultStandardIndex() );
    flags.remove( Component.POSITIONS );
    testPartitionMerge( true, flags, 4, 4 );
    testPartitionMerge( true, flags, -4, 4 );
    flags.remove( Component.COUNTS );
    testPartitionMerge( true, flags, 4, 4 );
    testPartitionMerge( true, flags, -4, 4 );
   
    testPartitionMerge( true, defaultStandardIndex(), 0, 0 );
    testPartitionMerge( true, defaultStandardIndex(), 1, 1 );
    testPartitionMerge( true, defaultStandardIndex(), 1, 2 );
    testPartitionMerge( true, defaultStandardIndex(), 4, 1 );
    testPartitionMerge( true, defaultStandardIndex(), 4, 4 );
    testPartitionMerge( true, defaultStandardIndex(), 8, 1 );
    testPartitionMerge( true, defaultStandardIndex(), 8, 4 );
    testPartitionMerge( true, defaultStandardIndex(), -1, 1 );
    testPartitionMerge( true, defaultStandardIndex(), -1, 2 );
    testPartitionMerge( true, defaultStandardIndex(), -4, 1 );
    testPartitionMerge( true, defaultStandardIndex(), -4, 4 );
    testPartitionMerge( true, defaultStandardIndex(), -8, 1 );
    testPartitionMerge( true, defaultStandardIndex(), -8, 4 );

    testPartitionMerge( false, defaultStandardIndex(), 1, 0 );
    testPartitionMerge( false, defaultStandardIndex(), 1, 1 );
    testPartitionMerge( false, defaultStandardIndex(), 1, 2 );
    testPartitionMerge( false, defaultStandardIndex(), 4, 1 );
    testPartitionMerge( false, defaultStandardIndex(), 4, 4 );
    testPartitionMerge( false, defaultStandardIndex(), 8, 1 );
    testPartitionMerge( false, defaultStandardIndex(), 8, 4 );
    testPartitionMerge( false, defaultStandardIndex(), -1, 1 );
    testPartitionMerge( false, defaultStandardIndex(), -1, 2 );
    testPartitionMerge( false, defaultStandardIndex(), -4, 1 );
    testPartitionMerge( false, defaultStandardIndex(), -4, 4 );
    testPartitionMerge( false, defaultStandardIndex(), -8, 1 );
    testPartitionMerge( false, defaultStandardIndex(), -8, 4 );
  }

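  /**
   * Builds the test indices, partitions the text and virtual indices lexically into three uniform parts and
   * checks the resulting clusters against the original indices.
   */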
  public void testLexicalPartitioning( boolean interleaved, Map<Component, Coding> flags ) throws ConfigurationException, SecurityException, IOException, ClassNotFoundException, InstantiationException, IllegalAccessException,
      Exception {
    // Vanilla indexing
    new IndexBuilder( basename, getSequence() ).standardWriterFlags( flags ).interleaved( interleaved ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).virtualDocumentResolver( 3, RESOLVER ).run();

    // We partition the term list into three uniform lexical parts
    final LexicalPartitioningStrategy uniform = LexicalStrategies.uniform( 3, DiskBasedIndex.getInstance( basename + "-text" ) );
    BinIO.storeObject( uniform, basename + "-strategy" );

    new PartitionLexically( basename + "-text", basename + "-text-part", uniform, basename + "-strategy", 1024, DEFAULT_LOG_INTERVAL ).run();
    new PartitionLexically( basename + "-virtual", basename + "-virtual-part", uniform, basename + "-strategy", 1024, DEFAULT_LOG_INTERVAL ).run();

    String[] localIndex = new Properties( basename + "-text-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
    for ( String index : localIndex )
      BinIO.storeObject( createMap( index + TERMS_EXTENSION ), index + TERMMAP_EXTENSION );
    sameContent( basename + "-text", basename + "-text-part", new FileLinesCollection( basename + "-text" + TERMS_EXTENSION, "UTF-8" ).iterator() );
    sameContent( basename + "-virtual", basename + "-virtual-part" );
  }

  public void testLexicalPartitioning() throws ConfigurationException, SecurityException, IOException, ClassNotFoundException, InstantiationException, IllegalAccessException, Exception {
    testLexicalPartitioning( true, defaultStandardIndex() );
    testLexicalPartitioning( false, defaultStandardIndex() );
    Reference2ObjectOpenHashMap<Component, Coding> flags = new Reference2ObjectOpenHashMap<Component, Coding>( defaultStandardIndex() );
    flags.remove( Component.POSITIONS );
    testLexicalPartitioning( true, flags );
    flags.remove( Component.COUNTS );
    testLexicalPartitioning( true, flags );
  }

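  /**
   * Builds the test indices over an empty document sequence, checks them against the (empty) content, and
   * verifies that a permuted build over the same empty sequence produces identical indices.
   */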
  public void testEmpty( boolean interleaved, Map<Component, Coding> flags, int quantum, int height ) throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException,
      IllegalAccessException, InvocationTargetException, NoSuchMethodException {
    // Vanilla indexing
    new IndexBuilder( basename, getEmptySequence() ).standardWriterFlags( flags ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum ).height( height )
        .virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( 20 ).run();
    checkAgainstContent( getEmptySequence(), null, RESOLVER, Scan.DEFAULT_VIRTUAL_DOCUMENT_GAP, Index.getInstance( basename + "-text" ), Index.getInstance( basename + "-int" ), Index
        .getInstance( basename + "-date" ), Index.getInstance( basename + "-virtual" ) );

    // Permuted indexing
    String mapFile = File.createTempFile( this.getClass().getSimpleName(), "permutation" ).toString();
    new IndexBuilder( basename + "-mapped", getEmptySequence() ).standardWriterFlags( flags ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum ).height( height )
        .virtualDocumentResolver( 3, RESOLVER ).mapFile( mapFile ).documentsPerBatch( 20 ).run();

    sameIndex( basename + "-text", basename + "-mapped-text" );
    sameIndex( basename + "-int", basename + "-mapped-int" );
    sameIndex( basename + "-date", basename + "-mapped-date" );
    sameIndex( basename + "-virtual", basename + "-mapped-virtual" );
  }

  public void testEmpty() throws Exception {
    final Reference2ObjectOpenHashMap<Component, Coding> flags = new Reference2ObjectOpenHashMap<Component, Coding>( defaultStandardIndex() );
    flags.remove( Component.POSITIONS );
    testEmpty( true, flags, 4, 4 );
    testEmpty( true, flags, -4, 4 );
    flags.remove( Component.COUNTS );
    testEmpty( true, flags, 4, 4 );
    testEmpty( true, flags, -4, 4 );


    testEmpty( true, defaultStandardIndex(), 0, 0 );
    testEmpty( true, defaultStandardIndex(), 1, 1 );
    testEmpty( true, defaultStandardIndex(), 1, 2 );
    testEmpty( true, defaultStandardIndex(), 4, 1 );
    testEmpty( true, defaultStandardIndex(), 4, 4 );
    testEmpty( true, defaultStandardIndex(), 8, 1 );
    testEmpty( true, defaultStandardIndex(), 8, 4 );
    testEmpty( true, defaultStandardIndex(), -1, 1 );
    testEmpty( true, defaultStandardIndex(), -1, 2 );
    testEmpty( true, defaultStandardIndex(), -8, 1 );
    testEmpty( true, defaultStandardIndex(), -8, 4 );
    testEmpty( true, defaultStandardIndex(), -8, 1 );
    testEmpty( true, defaultStandardIndex(), -8, 4 );

    testEmpty( false, defaultStandardIndex(), 1, 0 );
    testEmpty( false, defaultStandardIndex(), 1, 1 );
    testEmpty( false, defaultStandardIndex(), 1, 2 );
    testEmpty( false, defaultStandardIndex(), 4, 1 );
    testEmpty( false, defaultStandardIndex(), 4, 4 );
    testEmpty( false, defaultStandardIndex(), 8, 1 );
    testEmpty( false, defaultStandardIndex(), 8, 4 );
    testEmpty( false, defaultStandardIndex(), -1, 1 );
    testEmpty( false, defaultStandardIndex(), -1, 2 );
    testEmpty( false, defaultStandardIndex(), -8, 1 );
    testEmpty( false, defaultStandardIndex(), -8, 4 );
    testEmpty( false, defaultStandardIndex(), -8, 1 );
    testEmpty( false, defaultStandardIndex(), -8, 4 );
  }

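  /**
   * Builds the test indices and re-checks them against the content after loading them with various options
   * (in-memory, memory-mapped, different offset steps).
   */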
  public void testLoadOptions( boolean interleaved, int quantum, int height ) throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException,
      IllegalAccessException, InvocationTargetException, NoSuchMethodException {
    // Vanilla indexing
    new IndexBuilder( basename, getSequence() ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum ).height( height )
        .virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( 20 ).run();
    for ( String options : new String[] { "inmemory=1", "mapped=1", "offsetstep=0", "offsetstep=-2" } )
      checkAgainstContent( getSequence(), null, RESOLVER, Scan.DEFAULT_VIRTUAL_DOCUMENT_GAP, Index.getInstance( basename + "-text?" + options ),
          Index.getInstance( basename + "-int?" + options ), Index.getInstance( basename + "-date?" + options ), Index.getInstance( basename + "-virtual?" + options ) );
  }

  public void testLoadOptions() throws Exception {
    testLoadOptions( true, 0, 0 );
    testLoadOptions( true, 1, 1 );
    testLoadOptions( true, -1, 1 );

    testLoadOptions( false, 1, 0 );
    testLoadOptions( false, 1, 1 );
    testLoadOptions( false, -1, 1 );
  }

}