/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.benchmark.byTask;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.text.Collator;
import java.util.List;
import java.util.Locale;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.benchmark.BenchmarkTestCase;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker;
import org.apache.lucene.benchmark.byTask.stats.TaskStats;
import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.collation.CollationKeyAnalyzer;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.index.SerialMergeScheduler;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.apache.lucene.util._TestUtil;

/**
* Tests, very simply, that perf tasks (simple algorithms) do what they should.
*/
@SuppressCodecs("Lucene3x")
public class TestPerfTasksLogic extends BenchmarkTestCase {

  @Override
  public void setUp() throws Exception {
    super.setUp();
    copyToWorkDir("reuters.first20.lines.txt");
    copyToWorkDir("test-mapping-ISOLatin1Accent-partial.txt");
  }

  /**
   * Test index creation logic
   */
  public void testIndexAndSearchTasks() throws Exception {
    // 1. alg definition (required in every "logic" test)
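    // Rough .alg syntax reminder for these tests: "{ Task } : N" repeats Task N times
    // sequentially, "[ Task ] : N" runs N parallel threads, and closing a sequence
    // with '>' instead of '}' or ']' keeps its sub-tasks out of the reports.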
    String algLines[] = {
        "ResetSystemErase",
        "CreateIndex",
        "{ AddDoc } : 1000",
        "ForceMerge(1)",
        "CloseIndex",
        "OpenReader",
        "{ CountingSearchTest } : 200",
        "CloseReader",
        "[ CountingSearchTest > : 70",
        "[ CountingSearchTest > : 9",
    };
   
    // 2. we test this value later
    CountingSearchTestTask.numSearches = 0;
   
    // 3. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 4. test specific checks after the benchmark run completed.
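    // 200 sequential searches plus 70 + 9 from the parallel batches = 279 calls in total.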
    assertEquals("TestSearchTask was supposed to be called!",279,CountingSearchTestTask.numSearches);
    assertTrue("Index does not exist?...!", DirectoryReader.indexExists(benchmark.getRunData().getDirectory()));
    // now we should be able to open the index for write.
    IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),
        new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
            .setOpenMode(OpenMode.APPEND));
    iw.close();
    IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
    assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
    ir.close();
  }

  /**
   * Test timed sequence task.
   */
  public void testTimedSearchTask() throws Exception {
    String algLines[] = {
        "log.step=100000",
        "ResetSystemErase",
        "CreateIndex",
        "{ AddDoc } : 100",
        "ForceMerge(1)",
        "CloseIndex",
        "OpenReader",
        "{ CountingSearchTest } : .5s",
        "CloseReader",
    };

    CountingSearchTestTask.numSearches = 0;
    execBenchmark(algLines);
    assertTrue(CountingSearchTestTask.numSearches > 0);
    long elapsed = CountingSearchTestTask.prevLastMillis - CountingSearchTestTask.startMillis;
    assertTrue("elapsed time was " + elapsed + " msec", elapsed <= 1500);
  }

  // disabled until we fix BG thread prio -- this test
  // causes build to hang
  public void testBGSearchTaskThreads() throws Exception {
    String algLines[] = {
        "log.time.step.msec = 100",
        "log.step=100000",
        "ResetSystemErase",
        "CreateIndex",
        "{ AddDoc } : 1000",
        "ForceMerge(1)",
        "CloseIndex",
        "OpenReader",
        "{",
        "  [ \"XSearch\" { CountingSearchTest > : * ] : 2 &-1",
        "  Wait(0.5)",
        "}",
        "CloseReader",
        "RepSumByPref X"
    };

    CountingSearchTestTask.numSearches = 0;
    execBenchmark(algLines);

    // NOTE: cannot assert this, because on a super-slow
    // system, it could be after waiting 0.5 seconds that
    // the search threads hadn't yet succeeded in starting
    // up and then they start up and do no searching:
    //assertTrue(CountingSearchTestTask.numSearches > 0);
  }

  public void testHighlighting() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "doc.stored=true",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "query.maker=" + ReutersQueryMaker.class.getName(),
        "ResetSystemErase",
        "CreateIndex",
        "{ AddDoc } : 100",
        "ForceMerge(1)",
        "CloseIndex",
        "OpenReader",
        "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
        "CloseReader",
    };

    // 2. we test this value later
    CountingHighlighterTestTask.numHighlightedResults = 0;
    CountingHighlighterTestTask.numDocsRetrieved = 0;
    // 3. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 4. test specific checks after the benchmark run completed.
    assertEquals("TestSearchTask was supposed to be called!",92,CountingHighlighterTestTask.numDocsRetrieved);
    //pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs
    //we probably should use a different doc/query maker, but...
    assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0);

    assertTrue("Index does not exist?...!", DirectoryReader.indexExists(benchmark.getRunData().getDirectory()));
    // now we should be able to open the index for write.
    IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND));
    iw.close();
    IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
    assertEquals("100 docs were added to the index, this is what we expect to find!",100,ir.numDocs());
    ir.close();
  }

  public void testHighlightingTV() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "doc.stored=true",//doc storage is required in order to have text to highlight
        "doc.term.vector=true",
        "doc.term.vector.offsets=true",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "query.maker=" + ReutersQueryMaker.class.getName(),
        "ResetSystemErase",
        "CreateIndex",
        "{ AddDoc } : 1000",
        "ForceMerge(1)",
        "CloseIndex",
        "OpenReader",
        "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
        "CloseReader",
    };

    // 2. we test this value later
    CountingHighlighterTestTask.numHighlightedResults = 0;
    CountingHighlighterTestTask.numDocsRetrieved = 0;
    // 3. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 4. test specific checks after the benchmark run completed.
    assertEquals("TestSearchTask was supposed to be called!",92,CountingHighlighterTestTask.numDocsRetrieved);
    //pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs
    //we probably should use a different doc/query maker, but...
    assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0);

    assertTrue("Index does not exist?...!", DirectoryReader.indexExists(benchmark.getRunData().getDirectory()));
    // now we should be able to open the index for write.
    IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND));
    iw.close();
    IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
    assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
    ir.close();
  }

  public void testHighlightingNoTvNoStore() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "doc.stored=false",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "query.maker=" + ReutersQueryMaker.class.getName(),
        "ResetSystemErase",
        "CreateIndex",
        "{ AddDoc } : 1000",
        "ForceMerge(1)",
        "CloseIndex",
        "OpenReader",
        "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
        "CloseReader",
    };

    // 2. we test this value later
    CountingHighlighterTestTask.numHighlightedResults = 0;
    CountingHighlighterTestTask.numDocsRetrieved = 0;
    // 3. execute the algorithm  (required in every "logic" test)
    try {
      execBenchmark(algLines);
      fail("CountingHighlighterTest should have thrown an exception");
    } catch (Exception e) {
      // expected: highlighting needs stored fields, and doc.stored=false here
    }
  }

  /**
   * Test exhausting Doc Maker logic
   */
  public void testExhaustContentSource() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource",
        "content.source.log.step=1",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "doc.stored=false",
        "doc.tokenized=false",
        "# ----- alg ",
        "CreateIndex",
        "{ AddDoc } : * ",
        "ForceMerge(1)",
        "CloseIndex",
        "OpenReader",
        "{ CountingSearchTest } : 100",
        "CloseReader",
        "[ CountingSearchTest > : 30",
        "[ CountingSearchTest > : 9",
    };
   
    // 2. we test this value later
    CountingSearchTestTask.numSearches = 0;
   
    // 3. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 4. test specific checks after the benchmark run completed.
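    // 100 sequential searches plus 30 + 9 from the parallel batches = 139 calls in total.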
    assertEquals("TestSearchTask was supposed to be called!",139,CountingSearchTestTask.numSearches);
    assertTrue("Index does not exist?...!", DirectoryReader.indexExists(benchmark.getRunData().getDirectory()));
    // now we should be able to open the index for write.
    IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND));
    iw.close();
    IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
    assertEquals("1 docs were added to the index, this is what we expect to find!",1,ir.numDocs());
    ir.close();
  }

  // LUCENE-1994: test thread safety of SortableSingleDocMaker
  public void testDocMakerThreadSafety() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.SortableSingleDocSource",
        "doc.term.vector=false",
        "log.step.AddDoc=10000",
        "content.source.forever=true",
        "directory=RAMDirectory",
        "doc.reuse.fields=false",
        "doc.stored=false",
        "doc.tokenized=false",
        "doc.index.props=true",
        "# ----- alg ",
        "CreateIndex",
        "[ { AddDoc > : 250 ] : 4",
        "CloseIndex",
    };
   
    // 2. we test this value later
    CountingSearchTestTask.numSearches = 0;
   
    // 3. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);
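    // "[ { AddDoc > : 250 ] : 4" = 4 threads adding 250 docs each, hence 1000 docs below.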

    DirectoryReader r = DirectoryReader.open(benchmark.getRunData().getDirectory());
    SortedDocValues idx = FieldCache.DEFAULT.getTermsIndex(SlowCompositeReaderWrapper.wrap(r), "country");
    final int maxDoc = r.maxDoc();
    assertEquals(1000, maxDoc);
    for(int i=0;i<1000;i++) {
      assertTrue("doc " + i + " has null country", idx.getOrd(i) != -1);
    }
    r.close();
  }

  /**
   * Test Parallel Doc Maker logic (for LUCENE-940)
   */
  public void testParallelDocMaker() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=FSDirectory",
        "doc.stored=false",
        "doc.tokenized=false",
        "# ----- alg ",
        "CreateIndex",
        "[ { AddDoc } : * ] : 4 ",
        "CloseIndex",
    };
   
    // 2. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 3. test number of docs in the index
    IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
    int ndocsExpected = 20; // first 20 reuters docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();
  }

  /**
   * Test WriteLineDoc and LineDocSource.
   */
  public void testLineDocFile() throws Exception {
    File lineFile = new File(TEMP_DIR, "test.reuters.lines.txt");

    // We will call WriteLineDoc this many times
    final int NUM_TRY_DOCS = 50;

    // Creates a line file with first 50 docs from SingleDocSource
    String algLines1[] = {
      "# ----- properties ",
      "content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource",
      "content.source.forever=true",
      "line.file.out=" + lineFile.getAbsolutePath().replace('\\', '/'),
      "# ----- alg ",
      "{WriteLineDoc()}:" + NUM_TRY_DOCS,
    };

    // Run algo
    Benchmark benchmark = execBenchmark(algLines1);

    BufferedReader r = new BufferedReader(
        new InputStreamReader(
            new FileInputStream(lineFile), "UTF-8"));
    int numLines = 0;
    String line;
    while((line = r.readLine()) != null) {
      if (numLines==0 && line.startsWith(WriteLineDocTask.FIELDS_HEADER_INDICATOR)) {
        continue; // do not count the header line as a doc
      }
      numLines++;
    }
    r.close();
    assertEquals("did not see the right number of docs; should be " + NUM_TRY_DOCS + " but was " + numLines, NUM_TRY_DOCS, numLines);
   
    // Index the line docs
    String algLines2[] = {
      "# ----- properties ",
      "analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer",
      "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
      "docs.file=" + lineFile.getAbsolutePath().replace('\\', '/'),
      "content.source.forever=false",
      "doc.reuse.fields=false",
      "ram.flush.mb=4",
      "# ----- alg ",
      "ResetSystemErase",
      "CreateIndex",
      "{AddDoc}: *",
      "CloseIndex",
    };
   
    // Run algo
    benchmark = execBenchmark(algLines2);

    // now we should be able to open the index for write.
    IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),
        new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
            .setOpenMode(OpenMode.APPEND));
    iw.close();

    IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
    assertEquals(numLines + " lines were created but " + ir.numDocs() + " docs are in the index", numLines, ir.numDocs());
    ir.close();

    lineFile.delete();
  }
 
  /**
   * Test ReadTokensTask
   */
  public void testReadTokens() throws Exception {

    // We will call ReadTokens on this many docs
    final int NUM_DOCS = 20;

    // Read tokens from first NUM_DOCS docs from Reuters and
    // then build index from the same docs
    String algLines1[] = {
      "# ----- properties ",
      "analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer",
      "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
      "docs.file=" + getReuters20LinesFile(),
      "# ----- alg ",
      "{ReadTokens}: " + NUM_DOCS,
      "ResetSystemErase",
      "CreateIndex",
      "{AddDoc}: " + NUM_DOCS,
      "CloseIndex",
    };

    // Run algo
    Benchmark benchmark = execBenchmark(algLines1);

    List<TaskStats> stats = benchmark.getRunData().getPoints().taskStats();

    // Count how many tokens all ReadTokens saw
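    // ReadTokens counts one unit of work per token it reads, so getCount() sums tokens here.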
    int totalTokenCount1 = 0;
    for (final TaskStats stat : stats) {
      if (stat.getTask().getName().equals("ReadTokens")) {
        totalTokenCount1 += stat.getCount();
      }
    }

    // Separately count how many tokens are actually in the index:
    IndexReader reader = DirectoryReader.open(benchmark.getRunData().getDirectory());
    assertEquals(NUM_DOCS, reader.numDocs());

    int totalTokenCount2 = 0;

    Fields fields = MultiFields.getFields(reader);

    for (String fieldName : fields) {
      if (fieldName.equals(DocMaker.ID_FIELD) || fieldName.equals(DocMaker.DATE_MSEC_FIELD) || fieldName.equals(DocMaker.TIME_SEC_FIELD)) {
        continue;
      }
      Terms terms = fields.terms(fieldName);
      if (terms == null) {
        continue;
      }
      TermsEnum termsEnum = terms.iterator(null);
      DocsEnum docs = null;
      while(termsEnum.next() != null) {
        docs = _TestUtil.docs(random(), termsEnum, MultiFields.getLiveDocs(reader), docs, DocsEnum.FLAG_FREQS);
        while(docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          totalTokenCount2 += docs.freq();
        }
      }
    }
    reader.close();

    // Make sure they are the same
    assertEquals(totalTokenCount1, totalTokenCount2);
  }
 
  /**
   * Test that " {[AddDoc(4000)]: 4} : * " works corrcetly (for LUCENE-941)
   */
  public void testParallelExhausted() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "doc.stored=false",
        "doc.tokenized=false",
        "task.max.depth.log=1",
        "# ----- alg ",
        "CreateIndex",
        "{ [ AddDoc]: 4} : * ",
        "ResetInputs ",
        "{ [ AddDoc]: 4} : * ",
        "WaitForMerges",
        "CloseIndex",
    };
   
    // 2. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 3. test number of docs in the index
    IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
    int ndocsExpected = 2 * 20; // first 20 reuters docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();
  }


  /**
   * Test that exhaust in loop works as expected (LUCENE-1115).
   */
  public void testExhaustedLooped() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "doc.stored=false",
        "doc.tokenized=false",
        "task.max.depth.log=1",
        "# ----- alg ",
        "{ \"Rounds\"",
        "  ResetSystemErase",
        "  CreateIndex",
        "  { \"AddDocs\"  AddDoc > : * ",
        "  WaitForMerges",
        "  CloseIndex",
        "} : 2",
    };
   
    // 2. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 3. test number of docs in the index
    IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
    int ndocsExpected = 20; // first 20 reuters docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();
  }
 
  /**
   * Test that we can close IndexWriter with argument "false".
   */
  public void testCloseIndexFalse() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "ram.flush.mb=-1",
        "max.buffered=2",
        "content.source.log.step=3",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "doc.stored=false",
        "doc.tokenized=false",
        "debug.level=1",
        "# ----- alg ",
        "{ \"Rounds\"",
        "  ResetSystemErase",
        "  CreateIndex",
        "  { \"AddDocs\"  AddDoc > : * ",
        "  CloseIndex(false)",
        "} : 2",
    };
   
    // 2. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 3. test number of docs in the index
    IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
    int ndocsExpected = 20; // first 20 reuters docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();
  }

  public static class MyMergeScheduler extends SerialMergeScheduler {
    boolean called;
    public MyMergeScheduler() {
      super();
      called = true;
    }
  }

  /**
   * Test that we can set the merge scheduler.
   */
  public void testMergeScheduler() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "merge.scheduler=" + MyMergeScheduler.class.getName(),
        "doc.stored=false",
        "doc.tokenized=false",
        "debug.level=1",
        "# ----- alg ",
        "{ \"Rounds\"",
        "  ResetSystemErase",
        "  CreateIndex",
        "  { \"AddDocs\"  AddDoc > : * ",
        "} : 2",
    };
    // 2. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    assertTrue("did not use the specified MergeScheduler",
        ((MyMergeScheduler) benchmark.getRunData().getIndexWriter().getConfig()
            .getMergeScheduler()).called);
    benchmark.getRunData().getIndexWriter().close();

    // 3. test number of docs in the index
    IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
    int ndocsExpected = 20; // first 20 reuters docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();
  }

  public static class MyMergePolicy extends LogDocMergePolicy {
    boolean called;
    public MyMergePolicy() {
      called = true;
    }
  }
 
  /**
   * Test that we can set the merge policy.
   */
  public void testMergePolicy() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "ram.flush.mb=-1",
        "max.buffered=2",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "merge.policy=" + MyMergePolicy.class.getName(),
        "doc.stored=false",
        "doc.tokenized=false",
        "debug.level=1",
        "# ----- alg ",
        "{ \"Rounds\"",
        "  ResetSystemErase",
        "  CreateIndex",
        "  { \"AddDocs\"  AddDoc > : * ",
        "} : 2",
    };

    // 2. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);
    assertTrue("did not use the specified MergePolicy", ((MyMergePolicy) benchmark.getRunData().getIndexWriter().getConfig().getMergePolicy()).called);
    benchmark.getRunData().getIndexWriter().close();
   
    // 3. test number of docs in the index
    IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
    int ndocsExpected = 20; // first 20 reuters docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();
  }

  /**
   * Test that IndexWriter settings stick.
   */
  public void testIndexWriterSettings() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "ram.flush.mb=-1",
        "max.buffered=2",
        "compound=cmpnd:true:false",
        "doc.term.vector=vector:false:true",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "doc.stored=false",
        "merge.factor=3",
        "doc.tokenized=false",
        "debug.level=1",
        "# ----- alg ",
        "{ \"Rounds\"",
        "  ResetSystemErase",
        "  CreateIndex",
        "  { \"AddDocs\"  AddDoc > : * ",
        "  NewRound",
        "} : 2",
    };

    // 2. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);
    final IndexWriter writer = benchmark.getRunData().getIndexWriter();
    assertEquals(2, writer.getConfig().getMaxBufferedDocs());
    assertEquals(IndexWriterConfig.DISABLE_AUTO_FLUSH, (int) writer.getConfig().getRAMBufferSizeMB());
    assertEquals(3, ((LogMergePolicy) writer.getConfig().getMergePolicy()).getMergeFactor());
    assertEquals(0.0d, writer.getConfig().getMergePolicy().getNoCFSRatio(), 0.0);
    writer.close();
    Directory dir = benchmark.getRunData().getDirectory();
    IndexReader reader = DirectoryReader.open(dir);
    Fields tfv = reader.getTermVectors(0);
    assertNotNull(tfv);
    assertTrue(tfv.size() > 0);
    reader.close();
  }

  /**
   * Test indexing with facets tasks.
   */
  public void testIndexingWithFacets() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=100",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "doc.stored=false",
        "merge.factor=3",
        "doc.tokenized=false",
        "debug.level=1",
        "# ----- alg ",
        "ResetSystemErase",
        "CreateIndex",
        "CreateTaxonomyIndex",
        "{ \"AddDocs\"  AddFacetedDoc > : * ",
        "CloseIndex",
        "CloseTaxonomyIndex",
        "OpenTaxonomyReader",
    };

    // 2. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);
    PerfRunData runData = benchmark.getRunData();
    assertNull("taxo writer was not properly closed",runData.getTaxonomyWriter());
    TaxonomyReader taxoReader = runData.getTaxonomyReader();
    assertNotNull("taxo reader was not opened", taxoReader);
    assertTrue("nothing was added to the taxnomy (expecting root and at least one addtional category)",taxoReader.getSize()>1);
    taxoReader.close();
  }
 
  /**
   * Test that we can call forceMerge(maxNumSegments).
   */
  public void testForceMerge() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "ram.flush.mb=-1",
        "max.buffered=3",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "merge.policy=org.apache.lucene.index.LogDocMergePolicy",
        "doc.stored=false",
        "doc.tokenized=false",
        "debug.level=1",
        "# ----- alg ",
        "{ \"Rounds\"",
        "  ResetSystemErase",
        "  CreateIndex",
        "  { \"AddDocs\"  AddDoc > : * ",
        "  ForceMerge(3)",
        "  CloseIndex()",
        "} : 2",
    };
   
    // 2. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 3. test number of docs in the index
    IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
    int ndocsExpected = 20; // first 20 reuters docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();

    // Make sure we have 3 segments:
    SegmentInfos infos = new SegmentInfos();
    infos.read(benchmark.getRunData().getDirectory());
    assertEquals(3, infos.size());
  }
 
  /**
   * Test disabling task count (LUCENE-1136).
   */
  public void testDisableCounting() throws Exception {
    doTestDisableCounting(true);
    doTestDisableCounting(false);
  }

  private void doTestDisableCounting(boolean disable) throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = disableCountingLines(disable);
   
    // 2. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 3. test counters
    int n = disable ? 0 : 1;
    int nChecked = 0;
    for (final TaskStats stats : benchmark.getRunData().getPoints().taskStats()) {
      String taskName = stats.getTask().getName();
      if (taskName.equals("Rounds")) {
        assertEquals("Wrong total count!",20+2*n,stats.getCount());
        nChecked++;
      } else if (taskName.equals("CreateIndex")) {
        assertEquals("Wrong count for CreateIndex!",n,stats.getCount());
        nChecked++;
      } else if (taskName.equals("CloseIndex")) {
        assertEquals("Wrong count for CloseIndex!",n,stats.getCount());
        nChecked++;
      }
    }
    assertEquals("Missing some tasks to check!",3,nChecked);
  }

  private String[] disableCountingLines (boolean disable) {
    String dis = disable ? "-" : "";
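    // Prefixing a task with '-' disables its stats counting (the LUCENE-1136 feature).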
    return new String[] {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=30",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "doc.stored=false",
        "doc.tokenized=false",
        "task.max.depth.log=1",
        "# ----- alg ",
        "{ \"Rounds\"",
        "  ResetSystemErase",
        "  "+dis+"CreateIndex",            // optionally disable counting here
        "  { \"AddDocs\"  AddDoc > : * ",
        "  "+dis+"  CloseIndex",             // optionally disable counting here (with extra blanks)
        "}",
        "RepSumByName",
    };
  }

  /**
   * Test that we can change the Locale in the runData,
   * and that it is parsed as we expect.
   */
  public void testLocale() throws Exception {
    // empty Locale: clear it (null)
    Benchmark benchmark = execBenchmark(getLocaleConfig(""));
    assertNull(benchmark.getRunData().getLocale());

    // ROOT locale
    benchmark = execBenchmark(getLocaleConfig("ROOT"));
    assertEquals(new Locale(""), benchmark.getRunData().getLocale());
   
    // specify just a language
    benchmark = execBenchmark(getLocaleConfig("de"));
    assertEquals(new Locale("de"), benchmark.getRunData().getLocale());
   
    // specify language + country
    benchmark = execBenchmark(getLocaleConfig("en,US"));
    assertEquals(new Locale("en", "US"), benchmark.getRunData().getLocale());
   
    // specify language + country + variant
    benchmark = execBenchmark(getLocaleConfig("no,NO,NY"));
    assertEquals(new Locale("no", "NO", "NY"), benchmark.getRunData().getLocale());
  }
  
  private String[] getLocaleConfig(String localeParam) {
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "# ----- alg ",
        "{ \"Rounds\"",
        "  ResetSystemErase",
        "  NewLocale(" + localeParam + ")",
        "  CreateIndex",
        "  { \"AddDocs\"  AddDoc > : * ",
        "  NewRound",
        "} : 1",
    };
    return algLines;
  }
 
  /**
   * Test that we can create CollationAnalyzers.
   */
  public void testCollator() throws Exception {
    // ROOT locale
    Benchmark benchmark = execBenchmark(getCollatorConfig("ROOT", "impl:jdk"));
    CollationKeyAnalyzer expected = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator
        .getInstance(new Locale("")));
    assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
   
    // specify just a language
    benchmark = execBenchmark(getCollatorConfig("de", "impl:jdk"));
    expected = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("de")));
    assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
   
    // specify language + country
    benchmark = execBenchmark(getCollatorConfig("en,US", "impl:jdk"));
    expected = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("en",
        "US")));
    assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
   
    // specify language + country + variant
    benchmark = execBenchmark(getCollatorConfig("no,NO,NY", "impl:jdk"));
    expected = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("no",
        "NO", "NY")));
    assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
  }
 
  private void assertEqualCollation(Analyzer a1, Analyzer a2, String text)
      throws Exception {
    TokenStream ts1 = a1.tokenStream("bogus", text);
    TokenStream ts2 = a2.tokenStream("bogus", text);
    ts1.reset();
    ts2.reset();
    TermToBytesRefAttribute termAtt1 = ts1.addAttribute(TermToBytesRefAttribute.class);
    TermToBytesRefAttribute termAtt2 = ts2.addAttribute(TermToBytesRefAttribute.class);
    assertTrue(ts1.incrementToken());
    assertTrue(ts2.incrementToken());
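    // With this version's attribute API, getBytesRef() returns a reusable BytesRef
    // that a subsequent fillBytesRef() populates with the current term's bytes.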
    BytesRef bytes1 = termAtt1.getBytesRef();
    BytesRef bytes2 = termAtt2.getBytesRef();
    termAtt1.fillBytesRef();
    termAtt2.fillBytesRef();
    assertEquals(bytes1, bytes2);
    assertFalse(ts1.incrementToken());
    assertFalse(ts2.incrementToken());
    ts1.close();
    ts2.close();
  }
 
  private String[] getCollatorConfig(String localeParam,
      String collationParam) {
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "# ----- alg ",
        "{ \"Rounds\"",
        "  ResetSystemErase",
        "  NewLocale(" + localeParam + ")",
        "  NewCollationAnalyzer(" + collationParam + ")",
        "  CreateIndex",
        "  { \"AddDocs\"  AddDoc > : * ",
        "  NewRound",
        "} : 1",
    };
    return algLines;
  }
 
  /**
   * Test that we can create shingle analyzers using AnalyzerFactory.
   */
  public void testShingleAnalyzer() throws Exception {
    String text = "one,two,three, four five six";
   
    // StandardTokenizer, maxShingleSize, and outputUnigrams
    Benchmark benchmark = execBenchmark(getAnalyzerFactoryConfig
        ("shingle-analyzer", "StandardTokenizer,ShingleFilter"));
    benchmark.getRunData().getAnalyzer().tokenStream
        ("bogus", text).close();
    BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
                                             new String[] { "one", "one two", "two", "two three",
                                                            "three", "three four", "four", "four five",
                                                            "five", "five six", "six" });
    // StandardTokenizer, maxShingleSize = 3, and outputUnigrams = false
    benchmark = execBenchmark
      (getAnalyzerFactoryConfig
          ("shingle-analyzer",
           "StandardTokenizer,ShingleFilter(maxShingleSize:3,outputUnigrams:false)"));
    BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
                                             new String[] { "one two", "one two three", "two three",
                                                            "two three four", "three four",
                                                            "three four five", "four five",
                                                            "four five six", "five six" });
    // WhitespaceTokenizer, default maxShingleSize and outputUnigrams
    benchmark = execBenchmark
      (getAnalyzerFactoryConfig("shingle-analyzer", "WhitespaceTokenizer,ShingleFilter"));
    BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
                                             new String[] { "one,two,three,", "one,two,three, four",
                                                            "four", "four five", "five", "five six",
                                                            "six" });
   
    // WhitespaceTokenizer, maxShingleSize=3 and outputUnigrams=false
    benchmark = execBenchmark
      (getAnalyzerFactoryConfig
        ("shingle-factory",
         "WhitespaceTokenizer,ShingleFilter(outputUnigrams:false,maxShingleSize:3)"));
    BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
                                             new String[] { "one,two,three, four",
                                                            "one,two,three, four five",
                                                            "four five", "four five six",
                                                            "five six" });
  }
 
  private String[] getAnalyzerFactoryConfig(String name, String params) {
    final String singleQuoteEscapedName = name.replaceAll("'", "\\\\'");
    String algLines[] = {
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "work.dir=" + getWorkDir().getAbsolutePath().replaceAll("\\\\", "/"), // Fix Windows path
        "content.source.forever=false",
        "directory=RAMDirectory",
        "AnalyzerFactory(name:'" + singleQuoteEscapedName + "', " + params + ")",
        "NewAnalyzer('" + singleQuoteEscapedName + "')",
        "CreateIndex",
        "{ \"AddDocs\"  AddDoc > : * "
    };
    return algLines;
  }

  public void testAnalyzerFactory() throws Exception {
    String text = "Fortieth, Quarantième, Cuadragésimo";
    Benchmark benchmark = execBenchmark(getAnalyzerFactoryConfig
        ("ascii folded, pattern replaced, standard tokenized, downcased, bigrammed.'analyzer'",
         "positionIncrementGap:100,offsetGap:1111,"
         +"MappingCharFilter(mapping:'test-mapping-ISOLatin1Accent-partial.txt'),"
         +"PatternReplaceCharFilterFactory(pattern:'e(\\\\\\\\S*)m',replacement:\"$1xxx$1\"),"
         +"StandardTokenizer,LowerCaseFilter,NGramTokenFilter(minGramSize:2,maxGramSize:2)"));
    BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
        new String[] { "fo", "or", "rt", "ti", "ie", "et", "th",
                       "qu", "ua", "ar", "ra", "an", "nt", "ti", "ix", "xx", "xx", "xe",
                       "cu", "ua", "ad", "dr", "ra", "ag", "gs", "si", "ix", "xx", "xx", "xs", "si", "io"});
  }
 
  private String getReuters20LinesFile() {
    return getWorkDirResourcePath("reuters.first20.lines.txt");
  }
}