Package org.sindice.siren.search.node

Source Code of org.sindice.siren.search.node.TestNodeFuzzyQuery

/**
* Copyright 2014 National University of Ireland, Galway.
*
* This file is part of the SIREn project. Project and contact information:
*
*  https://github.com/rdelbru/SIREn
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*  http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sindice.siren.search.node;

import static org.sindice.siren.search.AbstractTestSirenScorer.dq;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.junit.Test;
import org.sindice.siren.analysis.AnyURIAnalyzer;
import org.sindice.siren.analysis.TupleAnalyzer;
import org.sindice.siren.index.codecs.RandomSirenCodec.PostingsFormatType;
import org.sindice.siren.search.node.MultiNodeTermQuery;
import org.sindice.siren.search.node.NodeFuzzyQuery;
import org.sindice.siren.search.node.MultiNodeTermQuery.TopTermsBoostOnlyNodeBooleanQueryRewrite;
import org.sindice.siren.util.BasicSirenTestCase;
import org.sindice.siren.util.XSDDatatype;

/**
* Tests copied from {@link FuzzyQuery} for the siren use case.
*/
public class TestNodeFuzzyQuery extends BasicSirenTestCase {

  @Override
  protected void configure() throws IOException {
    final AnyURIAnalyzer uriAnalyzer = new AnyURIAnalyzer(TEST_VERSION_CURRENT);
    final TupleAnalyzer tupleAnalyzer = new TupleAnalyzer(TEST_VERSION_CURRENT,
      new WhitespaceAnalyzer(TEST_VERSION_CURRENT), uriAnalyzer);
    tupleAnalyzer.registerDatatype(XSDDatatype.XSD_ANY_URI.toCharArray(), uriAnalyzer);
    this.setAnalyzer(tupleAnalyzer);
    this.setPostingsFormat(PostingsFormatType.RANDOM);
  }

  @Test
  public void testFuzziness() throws Exception {
    this.addDocument("<aaaaa>");
    this.addDocument("<aaaab>");
    this.addDocument("<aaabb>");
    this.addDocument("<aabbb>");
    this.addDocument("<abbbb>");
    this.addDocument("<bbbbb>");
    this.addDocument("<ddddd>");

    LuceneProxyNodeQuery query = dq(new NodeFuzzyQuery(new Term(DEFAULT_TEST_FIELD, "aaaaa"), NodeFuzzyQuery.defaultMaxEdits, 0));
    ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(3, hits.length);

    // same with prefix
    query = dq(new NodeFuzzyQuery(new Term(DEFAULT_TEST_FIELD, "aaaaa"), NodeFuzzyQuery.defaultMaxEdits, 1));
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(3, hits.length);
    query = dq(new NodeFuzzyQuery(new Term(DEFAULT_TEST_FIELD, "aaaaa"), NodeFuzzyQuery.defaultMaxEdits, 2));
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(3, hits.length);
    query = dq(new NodeFuzzyQuery(new Term(DEFAULT_TEST_FIELD, "aaaaa"), NodeFuzzyQuery.defaultMaxEdits, 3));
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(3, hits.length);
    query = dq(new NodeFuzzyQuery(new Term(DEFAULT_TEST_FIELD, "aaaaa"), NodeFuzzyQuery.defaultMaxEdits, 4));
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(2, hits.length);
    query = dq(new NodeFuzzyQuery(new Term(DEFAULT_TEST_FIELD, "aaaaa"), NodeFuzzyQuery.defaultMaxEdits, 5));
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(1, hits.length);
    query = dq(new NodeFuzzyQuery(new Term(DEFAULT_TEST_FIELD, "aaaaa"), NodeFuzzyQuery.defaultMaxEdits, 6));
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(1, hits.length);

    // test scoring
    query = dq(new NodeFuzzyQuery(new Term(DEFAULT_TEST_FIELD, "bbbbb"), NodeFuzzyQuery.defaultMaxEdits, 0));
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals("3 documents should match", 3, hits.length);
    List<Integer> order = Arrays.asList(5,4,3);
    for (int i = 0; i < hits.length; i++) {
      assertEquals((int) order.get(i), hits[i].doc);
    }

    // test pq size by supplying maxExpansions=2
    // This query would normally return 3 documents, because 3 terms match (see above):
    query = dq(new NodeFuzzyQuery(new Term(DEFAULT_TEST_FIELD, "bbbbb"), NodeFuzzyQuery.defaultMaxEdits, 0, 2, false));
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals("only 2 documents should match", 2, hits.length);
    order = Arrays.asList(5,4);
    for (int i = 0; i < hits.length; i++) {
      assertEquals((int) order.get(i), hits[i].doc);
    }

    // not similar enough:
    query = dq(new NodeFuzzyQuery(new Term(DEFAULT_TEST_FIELD, "xxxxx"), NodeFuzzyQuery.defaultMaxEdits, 0));
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(0, hits.length);
    query = dq(new NodeFuzzyQuery(new Term(DEFAULT_TEST_FIELD, "aaccc"), NodeFuzzyQuery.defaultMaxEdits, 0));   // edit distance to "aaaaa" = 3
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(0, hits.length);

    // query identical to a word in the index:
    query = dq(new NodeFuzzyQuery(new Term(DEFAULT_TEST_FIELD, "aaaaa"), NodeFuzzyQuery.defaultMaxEdits, 0));
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(3, hits.length);
    assertEquals(0, hits[0].doc);
    // default allows for up to two edits:
    assertEquals(1, hits[1].doc);
    assertEquals(2, hits[2].doc);

    // query similar to a word in the index:
    query = dq(new NodeFuzzyQuery(new Term(DEFAULT_TEST_FIELD, "aaaac"), NodeFuzzyQuery.defaultMaxEdits, 0));
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(3, hits.length);
    assertEquals(0, hits[0].doc);
    assertEquals(1, hits[1].doc);
    assertEquals(2, hits[2].doc);

    // now with prefix
    query = dq(new NodeFuzzyQuery(new Term(DEFAULT_TEST_FIELD, "aaaac"), NodeFuzzyQuery.defaultMaxEdits, 1));
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(3, hits.length);
    assertEquals(0, hits[0].doc);
    assertEquals(1, hits[1].doc);
    assertEquals(2, hits[2].doc);
    query = dq(new NodeFuzzyQuery(new Term(DEFAULT_TEST_FIELD, "aaaac"), NodeFuzzyQuery.defaultMaxEdits, 2));
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(3, hits.length);
    assertEquals(0, hits[0].doc);
    assertEquals(1, hits[1].doc);
    assertEquals(2, hits[2].doc);
    query = dq(new NodeFuzzyQuery(new Term(DEFAULT_TEST_FIELD, "aaaac"), NodeFuzzyQuery.defaultMaxEdits, 3));
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(3, hits.length);
    assertEquals(0, hits[0].doc);
    assertEquals(1, hits[1].doc);
    assertEquals(2, hits[2].doc);
    query = dq(new NodeFuzzyQuery(new Term(DEFAULT_TEST_FIELD, "aaaac"), NodeFuzzyQuery.defaultMaxEdits, 4));
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(2, hits.length);
    assertEquals(0, hits[0].doc);
    assertEquals(1, hits[1].doc);
    query = dq(new NodeFuzzyQuery(new Term(DEFAULT_TEST_FIELD, "aaaac"), NodeFuzzyQuery.defaultMaxEdits, 5));
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(0, hits.length);


    query = dq(new NodeFuzzyQuery(new Term(DEFAULT_TEST_FIELD, "ddddX"), NodeFuzzyQuery.defaultMaxEdits, 0));
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(1, hits.length);
    assertEquals(6, hits[0].doc);

    // now with prefix
    query = dq(new NodeFuzzyQuery(new Term(DEFAULT_TEST_FIELD, "ddddX"), NodeFuzzyQuery.defaultMaxEdits, 1));
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(1, hits.length);
    assertEquals(6, hits[0].doc);
    query = dq(new NodeFuzzyQuery(new Term(DEFAULT_TEST_FIELD, "ddddX"), NodeFuzzyQuery.defaultMaxEdits, 2));
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(1, hits.length);
    assertEquals(6, hits[0].doc);
    query = dq(new NodeFuzzyQuery(new Term(DEFAULT_TEST_FIELD, "ddddX"), NodeFuzzyQuery.defaultMaxEdits, 3));
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(1, hits.length);
    assertEquals(6, hits[0].doc);
    query = dq(new NodeFuzzyQuery(new Term(DEFAULT_TEST_FIELD, "ddddX"), NodeFuzzyQuery.defaultMaxEdits, 4));
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(1, hits.length);
    assertEquals(6, hits[0].doc);
    query = dq(new NodeFuzzyQuery(new Term(DEFAULT_TEST_FIELD, "ddddX"), NodeFuzzyQuery.defaultMaxEdits, 5));
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(0, hits.length);


    // different field = no match:
    query = dq(new NodeFuzzyQuery(new Term("anotherfield", "ddddX"), NodeFuzzyQuery.defaultMaxEdits, 0));
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(0, hits.length);
  }

  /**
   * MultiTermQuery provides (via attribute) information about which values
   * must be competitive to enter the priority queue.
   *
   * FuzzyQuery optimizes itself around this information, if the attribute
   * is not implemented correctly, there will be problems!
   */
  public void testTieBreaker() throws Exception {
    this.addDocument("<a123456>");
    this.addDocument("<c123456>");
    this.addDocument("<d123456>");
    this.addDocument("<e123456>");

    final Directory directory2 = newDirectory();
    final RandomIndexWriter writer2 = newRandomIndexWriter(directory2, analyzer, codec);
    addDocument(writer2, "<a123456>");
    addDocument(writer2, "<b123456>");
    addDocument(writer2, "<b123456>");
    addDocument(writer2, "<b123456>");
    addDocument(writer2, "<c123456>");
    addDocument(writer2, "<f123456>");

    final IndexReader ir1 = writer.getReader();
    final IndexReader ir2 = writer2.getReader();

    final MultiReader mr = new MultiReader(ir1, ir2);
    final IndexSearcher searcher = newSearcher(mr);
    final FuzzyQuery fq = new FuzzyQuery(new Term(DEFAULT_TEST_FIELD, "z123456"), 1, 0, 2, false);
    final TopDocs docs = searcher.search(fq, 2);
    assertEquals(5, docs.totalHits); // 5 docs, from the a and b's

    mr.close();
    ir2.close();
    writer2.close();
    directory2.close();
  }

  /** Test the {@link TopTermsBoostOnlyNodeBooleanQueryRewrite} rewrite method. */
  @Test
  public void testBoostOnlyRewrite() throws Exception {
    this.addDocument("<Lucene>");
    this.addDocument("<Lucene>");
    this.addDocument("<Lucenne>");

    final NodeFuzzyQuery query = new NodeFuzzyQuery(new Term(DEFAULT_TEST_FIELD, "Lucene"));
    query.setRewriteMethod(new MultiNodeTermQuery.TopTermsBoostOnlyNodeBooleanQueryRewrite(50));
    final ScoreDoc[] hits = searcher.search(dq(query), null, 1000).scoreDocs;
    assertEquals(3, hits.length);
  }

}
TOP

Related Classes of org.sindice.siren.search.node.TestNodeFuzzyQuery

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.