Package org.sindice.siren.index.codecs.siren10

Source Code of org.sindice.siren.index.codecs.siren10.TestSiren10PostingsFormat

/**
* Copyright 2014 National University of Ireland, Galway.
*
* This file is part of the SIREn project. Project and contact information:
*
*  https://github.com/rdelbru/SIREn
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*  http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sindice.siren.index.codecs.siren10;

import static org.sindice.siren.analysis.MockSirenDocument.doc;
import static org.sindice.siren.analysis.MockSirenToken.node;
import static org.sindice.siren.analysis.MockSirenToken.token;

import java.io.IOException;

import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
import org.sindice.siren.analysis.MockSirenDocument;
import org.sindice.siren.index.DocsAndNodesIterator;
import org.sindice.siren.index.codecs.RandomSirenCodec.PostingsFormatType;
import org.sindice.siren.index.codecs.siren10.Siren10PostingsReader.Siren10DocsEnum;
import org.sindice.siren.index.codecs.siren10.Siren10PostingsReader.Siren10DocsNodesAndPositionsEnum;
import org.sindice.siren.util.BasicSirenTestCase;

public class TestSiren10PostingsFormat extends BasicSirenTestCase {

  @Override
  protected void configure() throws IOException {
    this.setAnalyzer(AnalyzerType.MOCK);
    this.setPostingsFormat(PostingsFormatType.SIREN_10);
  }

  @Test
  public void testSimpleNextDocument() throws IOException {
    this.addDocuments(
      doc(token("aaa", node(1)), token("bbb", node(1,0)), token("aaa", node(2))),
      doc(token("aaa", node(1,0)), token("bbb", node(1,0,1,0))),
      doc(token("aaa", node(5,3,6,3)), token("bbb", node(5,3,6,3,7)))
    );

    final AtomicReader aReader = SlowCompositeReaderWrapper.wrap(reader);
    final DocsEnum docsEnum = aReader.termDocsEnum(new Term(DEFAULT_TEST_FIELD, "aaa"));
    assertTrue(docsEnum instanceof Siren10DocsEnum);
    final Siren10DocsNodesAndPositionsEnum e = ((Siren10DocsEnum) docsEnum).getDocsNodesAndPositionsEnum();
    assertEquals(-1, e.doc());
    assertEquals(0, e.nodeFreqInDoc());
    assertTrue(e.nextDocument());
    assertEquals(0, e.doc());
    assertEquals(2, e.nodeFreqInDoc());
    assertTrue(e.nextDocument());
    assertEquals(1, e.doc());
    assertEquals(1, e.nodeFreqInDoc());
    assertTrue(e.nextDocument());
    assertEquals(2, e.doc());
    assertEquals(1, e.nodeFreqInDoc());

    assertFalse(e.nextDocument());
    assertEquals(DocsAndNodesIterator.NO_MORE_DOC, e.doc());
  }

  @Test
  public void testSkipDoc() throws IOException {
    final MockSirenDocument[] docs = new MockSirenDocument[2048];
    for (int i = 0; i < 2048; i += 4) {
      docs[i] = doc(token("aaa", node(1)), token("bbb", node(1,0)), token("aaa", node(2)));
      docs[i + 1] = doc(token("aaa", node(1,0)), token("bbb", node(1,0,1,0)));
      docs[i + 2] = doc(token("aaa", node(5,3,6,3)), token("bbb", node(5,3,6,3,7)));
      docs[i + 3] = doc(token("bbb", node(2,0)), token("aaa", node(5,3,6)));
    }
    this.addDocuments(docs);

    final AtomicReader aReader = SlowCompositeReaderWrapper.wrap(reader);
    final DocsEnum docsEnum = aReader.termDocsEnum(new Term(DEFAULT_TEST_FIELD, new BytesRef("aaa")));
    assertTrue(docsEnum instanceof Siren10DocsEnum);
    final Siren10DocsNodesAndPositionsEnum e = ((Siren10DocsEnum) docsEnum).getDocsNodesAndPositionsEnum();

    // first skip in skiplist is at 512
    assertTrue(e.skipTo(502));
    assertEquals(502, e.doc());
    assertEquals(1, e.nodeFreqInDoc());

    // must have used the second skip
    assertTrue(e.skipTo(1624));
    assertEquals(1624, e.doc());
    assertEquals(2, e.nodeFreqInDoc());

    // no other skip, must have used the linear scan
    assertTrue(e.skipTo(2000));
    assertEquals(2000, e.doc());
    assertEquals(2, e.nodeFreqInDoc());

    assertFalse(e.skipTo(256323));

  }

  @Test
  public void testSimpleNextNode() throws IOException {
    this.addDocuments(
      doc(token("aaa", node(1)), token("bbb", node(1,0)), token("aaa", node(2))),
      doc(token("aaa", node(1,0)), token("bbb", node(1,0,1,0))),
      doc(token("aaa", node(5,3,6,3)), token("bbb", node(5,3,6,3,7)))
    );

    final AtomicReader aReader = SlowCompositeReaderWrapper.wrap(reader);
    final DocsEnum docsEnum = aReader.termDocsEnum(new Term(DEFAULT_TEST_FIELD, new BytesRef("aaa")));
    assertTrue(docsEnum instanceof Siren10DocsEnum);
    final Siren10DocsNodesAndPositionsEnum e = ((Siren10DocsEnum) docsEnum).getDocsNodesAndPositionsEnum();
    assertEquals(-1, e.doc());
    assertEquals(0, e.nodeFreqInDoc());
    assertEquals(node(-1), e.node());

    assertTrue(e.nextDocument());
    assertEquals(0, e.doc());
    assertEquals(2, e.nodeFreqInDoc());
    assertTrue(e.nextNode());
    assertEquals(node(1), e.node());
    assertTrue(e.nextNode());
    assertEquals(node(2), e.node());
    assertFalse(e.nextNode());
    assertEquals(DocsAndNodesIterator.NO_MORE_NOD, e.node());

    assertTrue(e.nextDocument());
    assertEquals(1, e.doc());
    assertEquals(1, e.nodeFreqInDoc());
    assertTrue(e.nextNode());
    assertEquals(node(1,0), e.node());
    assertFalse(e.nextNode());
    assertEquals(DocsAndNodesIterator.NO_MORE_NOD, e.node());

    assertTrue(e.nextDocument());
    assertEquals(2, e.doc());
    assertEquals(1, e.nodeFreqInDoc());
    assertTrue(e.nextNode());
    assertEquals(node(5,3,6,3), e.node());
    assertFalse(e.nextNode());
    assertEquals(DocsAndNodesIterator.NO_MORE_NOD, e.node());

    assertFalse(e.nextDocument());
    assertEquals(DocsAndNodesIterator.NO_MORE_DOC, e.doc());
  }

  @Test
  public void testSimpleSkipNode() throws IOException {
    this.addDocuments(
      doc(token("aaa", node(1)), token("bbb", node(1,0)), token("aaa", node(2))),
      doc(token("aaa", node(1,0)), token("bbb", node(1,0,1,0))),
      doc(token("aaa", node(5,3,6,3)), token("bbb", node(5,3,6,3,7)))
    );

    final AtomicReader aReader = SlowCompositeReaderWrapper.wrap(reader);
    final DocsEnum docsEnum = aReader.termDocsEnum(new Term(DEFAULT_TEST_FIELD, new BytesRef("aaa")));
    assertTrue(docsEnum instanceof Siren10DocsEnum);
    final Siren10DocsNodesAndPositionsEnum e = ((Siren10DocsEnum) docsEnum).getDocsNodesAndPositionsEnum();
    assertEquals(-1, e.doc());
    assertEquals(0, e.nodeFreqInDoc());

    // skip to 2 using linear scan. Node should be also be skipped.
    assertTrue(e.skipTo(2));
    assertEquals(2, e.doc());
    assertEquals(1, e.nodeFreqInDoc());
    assertTrue(e.nextNode());
    assertEquals(node(5,3,6,3), e.node());
    assertFalse(e.nextNode());

    assertFalse(e.nextDocument());
  }

  @Test
  public void testSkipNode() throws IOException {
    final MockSirenDocument[] docs = new MockSirenDocument[2048];
    for (int i = 0; i < 2048; i += 4) {
      docs[i] = doc(token("aaa", node(1)), token("bbb", node(1,0)), token("aaa", node(2)));
      docs[i + 1] = doc(token("aaa", node(1,0)), token("bbb", node(1,0,1,0)));
      docs[i + 2] = doc(token("aaa", node(5,3,6,3)), token("bbb", node(5,3,6,3,7)));
      docs[i + 3] = doc(token("bbb", node(2,0)), token("aaa", node(5,3,6)));
    }
    this.addDocuments(docs);

    final AtomicReader aReader = SlowCompositeReaderWrapper.wrap(reader);
    final DocsEnum docsEnum = aReader.termDocsEnum(new Term(DEFAULT_TEST_FIELD, new BytesRef("aaa")));
    assertTrue(docsEnum instanceof Siren10DocsEnum);
    final Siren10DocsNodesAndPositionsEnum e = ((Siren10DocsEnum) docsEnum).getDocsNodesAndPositionsEnum();

    // first skip in skiplist is at 512
    assertTrue(e.skipTo(502));
    assertEquals(502, e.doc());
    assertEquals(1, e.nodeFreqInDoc());
    assertTrue(e.nextNode());
    assertEquals(node(5,3,6,3), e.node());
    assertFalse(e.nextNode());

    // skip to 504 and scan partially nodes
    assertTrue(e.nextDocument());
    assertTrue(e.nextDocument());
    assertEquals(504, e.doc());
    assertEquals(2, e.nodeFreqInDoc());
    assertTrue(e.nextNode());
    assertEquals(node(1), e.node());

    // must have used the second skip
    assertTrue(e.skipTo(1624));
    assertEquals(1624, e.doc());
    assertEquals(2, e.nodeFreqInDoc());
    assertTrue(e.nextNode());
    assertEquals(node(1), e.node());
    assertTrue(e.nextNode());
    assertEquals(node(2), e.node());
    assertFalse(e.nextNode());

    // no other skip, must have used the linear scan
    assertTrue(e.skipTo(2000));
    assertEquals(2000, e.doc());
    assertEquals(2, e.nodeFreqInDoc());
    assertTrue(e.nextNode());
    assertEquals(node(1), e.node());
    assertTrue(e.nextNode());
    assertEquals(node(2), e.node());
    assertFalse(e.nextNode());

    assertFalse(e.skipTo(256323));

  }

  @Test
  public void testSimpleNextPosition() throws IOException {
    this.addDocuments(
      doc(token("aaa", node(1)), token("bbb", node(1,0)), token("aaa", node(2))),
      doc(token("bbb", node(1,0)), token("bbb", node(1,0,1,0))),
      doc(token("bbb", node(5,3,6)), token("aaa", node(5,3,6,3)), token("aaa", node(5,3,6,3)))
    );

    final AtomicReader aReader = SlowCompositeReaderWrapper.wrap(reader);
    final DocsEnum docsEnum = aReader.termDocsEnum(new Term(DEFAULT_TEST_FIELD, new BytesRef("aaa")));
    assertTrue(docsEnum instanceof Siren10DocsEnum);
    final Siren10DocsNodesAndPositionsEnum e = ((Siren10DocsEnum) docsEnum).getDocsNodesAndPositionsEnum();
    assertEquals(-1, e.doc());
    assertEquals(0, e.nodeFreqInDoc());
    assertEquals(node(-1), e.node());
    assertEquals(-1, e.pos());

    assertTrue(e.nextDocument());
    assertEquals(0, e.doc());
    assertEquals(2, e.nodeFreqInDoc());

    assertTrue(e.nextNode());
    assertEquals(node(1), e.node());
    assertEquals(1, e.termFreqInNode());

    assertTrue(e.nextPosition());
    assertEquals(0, e.pos());
    assertFalse(e.nextPosition());

    assertTrue(e.nextNode());
    assertEquals(node(2), e.node());
    assertEquals(1, e.termFreqInNode());

    assertTrue(e.nextPosition());
    assertEquals(0, e.pos());
    assertFalse(e.nextPosition());

    assertFalse(e.nextNode());

    assertTrue(e.nextDocument());
    assertEquals(2, e.doc());
    assertEquals(1, e.nodeFreqInDoc());

    assertTrue(e.nextNode());
    assertEquals(node(5,3,6,3), e.node());
    assertEquals(2, e.termFreqInNode());

    assertTrue(e.nextPosition());
    assertEquals(0, e.pos());
    assertTrue(e.nextPosition());
    assertEquals(1, e.pos());
    assertFalse(e.nextPosition());

    assertFalse(e.nextNode());

    assertFalse(e.nextDocument());
  }

  @Test
  public void testSimpleFrequencies() throws IOException {
    this.addDocuments(
      doc(token("aaa", node(1)), token("bbb", node(1,0)), token("aaa", node(2))),
      doc(token("aaa", node(1)), token("aaa", node(1)), token("aaa", node(2)))
    );

    final AtomicReader aReader = SlowCompositeReaderWrapper.wrap(reader);
    final DocsEnum docsEnum = aReader.termDocsEnum(new Term(DEFAULT_TEST_FIELD, new BytesRef("aaa")));
    assertTrue(docsEnum instanceof Siren10DocsEnum);
    final Siren10DocsNodesAndPositionsEnum e = ((Siren10DocsEnum) docsEnum).getDocsNodesAndPositionsEnum();
    assertEquals(-1, e.doc());

    // freqs should be set to 0 at the beginning
    assertEquals(0, e.nodeFreqInDoc());
    assertEquals(0, e.termFreqInNode());

    // nodeFreqInDoc should be set after calling nextDocument
    assertTrue(e.nextDocument());
    assertEquals(2, e.nodeFreqInDoc());
    // termFreqInNode should be set to 0
    assertEquals(0, e.termFreqInNode());
    // calling termFreqInNode should not change the freq settings
    assertEquals(2, e.nodeFreqInDoc());

    // termFreqInNode should be set after calling nextNode
    assertTrue(e.nextNode());
    // nodeFreqInDoc and nodeFreqInDoc should not have changed of settings
    assertEquals(2, e.nodeFreqInDoc());
    // termFreqInNode should be set to 1
    assertEquals(1, e.termFreqInNode());
    // calling termFreqInNode should not change the freqs settings
    assertEquals(2, e.nodeFreqInDoc());

    // calling nextPosition should not change freqs settings
    assertTrue(e.nextPosition());
    assertEquals(2, e.nodeFreqInDoc());
    assertEquals(1, e.termFreqInNode());

    // partially scanned position should not have consequences on nodeFreqInDoc
    // settings
    assertTrue(e.nextDocument());
    assertEquals(2, e.nodeFreqInDoc());
    assertTrue(e.nextNode());
    assertEquals(2, e.termFreqInNode());
    assertTrue(e.nextPosition());
    assertEquals(2, e.termFreqInNode());
    assertTrue(e.nextNode());
    assertEquals(1, e.termFreqInNode());
  }

  @Test
  public void testSimpleMerge() throws IOException {
    this.addDocuments(
      doc(token("aaa", node(1)), token("bbb", node(1,0)), token("aaa", node(2)))
    );
    this.addDocuments(
      doc(token("aaa", node(1)), token("bbb", node(1,0)), token("aaa", node(2)))
    );

    this.forceMerge();

    final AtomicReader aReader = SlowCompositeReaderWrapper.wrap(reader);
    final DocsEnum docsEnum = aReader.termDocsEnum(new Term(DEFAULT_TEST_FIELD, new BytesRef("aaa")));
    assertTrue(docsEnum instanceof Siren10DocsEnum);
    final Siren10DocsNodesAndPositionsEnum e = ((Siren10DocsEnum) docsEnum).getDocsNodesAndPositionsEnum();

    assertTrue(e.nextDocument());
    assertEquals(0, e.doc());
    assertEquals(2, e.nodeFreqInDoc());
    assertTrue(e.nextNode());
    assertEquals(1, e.termFreqInNode());
    assertTrue(e.nextPosition());
    assertEquals(0, e.pos());
    assertTrue(e.nextNode());
    assertEquals(1, e.termFreqInNode());
    assertTrue(e.nextPosition());
    assertEquals(0, e.pos());

    assertTrue(e.nextDocument());
    assertEquals(1, e.doc());
    assertEquals(2, e.nodeFreqInDoc());
    assertTrue(e.nextNode());
    assertEquals(1, e.termFreqInNode());
    assertTrue(e.nextPosition());
    assertEquals(0, e.pos());
    assertTrue(e.nextNode());
    assertEquals(1, e.termFreqInNode());
    assertTrue(e.nextPosition());
    assertEquals(0, e.pos());
  }

  @Test
  public void testMergeBlockSize() throws IOException {
    // reduce block size
    this.setPostingsFormat(new Siren10VIntPostingsFormat(2));

    this.addDocuments(
      doc(token("aaa", node(1)), token("bbb", node(1,0))),
      doc(token("aaa", node(1)), token("bbb", node(1,0))),
      doc(token("aaa", node(1)), token("bbb", node(1,0)))
    );

    this.addDocuments(
      doc(token("aaa", node(1)), token("bbb", node(1,0))),
      doc(token("aaa", node(1)), token("bbb", node(1,0)))
    );

    this.forceMerge();
  }

  @Test
  public void testStressMerge() throws IOException {
    this.addDocuments(
      doc(token("aaa", node(1)), token("bbb", node(1,0)), token("aaa", node(2)))
    );

    while (this.reader.numDocs() < 10000) {
      final int batchSize = LuceneTestCase.random().nextInt(20);
      final MockSirenDocument[] docs = new MockSirenDocument[batchSize];
      for (int i = 0; i < batchSize; i++) {
        docs[i] = doc(token("aaa", node(1)), token("bbb", node(1,0)), token("aaa", node(2)));
      }
      this.addDocuments(docs);
      this.forceMerge();
    }

    final AtomicReader aReader = SlowCompositeReaderWrapper.wrap(reader);
    final DocsEnum docsEnum = aReader.termDocsEnum(new Term(DEFAULT_TEST_FIELD, new BytesRef("aaa")));
    assertTrue(docsEnum instanceof Siren10DocsEnum);
    final Siren10DocsNodesAndPositionsEnum e = ((Siren10DocsEnum) docsEnum).getDocsNodesAndPositionsEnum();

    for (int i = 0; i < reader.numDocs(); i++) {
      assertTrue(e.nextDocument());
      assertEquals(i, e.doc());
      assertEquals(2, e.nodeFreqInDoc());

      assertTrue(e.nextNode());
      assertEquals(node(1), e.node());
      assertEquals(1, e.termFreqInNode());
      assertTrue(e.nextPosition());
      assertEquals(0, e.pos());

      assertTrue(e.nextNode());
      assertEquals(node(2), e.node());
      assertEquals(1, e.termFreqInNode());
      assertTrue(e.nextPosition());
      assertEquals(0, e.pos());

      assertFalse(e.nextNode());
    }
  }

  @Test
  public void testSkipDataCheckIndex() throws IOException {
    // The Lucene CheckIndex was catching a problem with how skip data level
    // were computed on this configuration.
    this.setPostingsFormat(new Siren10VIntPostingsFormat(256));

    final MockSirenDocument[] docs = new MockSirenDocument[1000];

    for (int i = 0; i < 1000; i++) {
     docs[i] = doc(token("aaa", node(1)), token("bbb", node(1,0)), token("aaa", node(2)));
    }
    this.addDocuments(docs);

    final AtomicReader aReader = SlowCompositeReaderWrapper.wrap(reader);
    final DocsEnum docsEnum = aReader.termDocsEnum(new Term(DEFAULT_TEST_FIELD, new BytesRef("aaa")));
    assertTrue(docsEnum instanceof Siren10DocsEnum);
  }

  @Test
  public void testDeltaNode() throws IOException {
    final MockSirenDocument[] docs = new MockSirenDocument[2048];
    for (int i = 0; i < 2048; i += 2) {
      docs[i] = doc(token("aaa", node(1,1)), token("aaa", node(2,1)), token("aaa", node(2,5)));
      docs[i + 1] = doc(token("aaa", node(5,3,1)), token("aaa", node(5,3,6,3)),
                        token("aaa", node(5,3,6,5)), token("aaa", node(6)));
    }
    this.addDocuments(docs);

    final AtomicReader aReader = SlowCompositeReaderWrapper.wrap(reader);
    final DocsEnum docsEnum = aReader.termDocsEnum(new Term(DEFAULT_TEST_FIELD, new BytesRef("aaa")));
    assertTrue(docsEnum instanceof Siren10DocsEnum);
    final Siren10DocsNodesAndPositionsEnum e = ((Siren10DocsEnum) docsEnum).getDocsNodesAndPositionsEnum();

    for (int i = 0; i < 2048; i += 2) {
      assertTrue(e.nextDocument());
      assertTrue(e.nextNode());
      assertEquals(node(1,1), e.node());
      assertTrue(e.nextNode());
      assertEquals(node(2,1), e.node());
      assertTrue(e.nextNode());
      assertEquals(node(2,5), e.node());
      assertTrue(e.nextDocument());
      assertTrue(e.nextNode());
      assertEquals(node(5,3,1), e.node());
      assertTrue(e.nextNode());
      assertEquals(node(5,3,6,3), e.node());
      assertTrue(e.nextNode());
      assertEquals(node(5,3,6,5), e.node());
      assertTrue(e.nextNode());
      assertEquals(node(6), e.node());
    }
  }

}
TOP

Related Classes of org.sindice.siren.index.codecs.siren10.TestSiren10PostingsFormat

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.