// Package org.leskes.test.elasticfacets.fields
//
// Source Code of org.leskes.test.elasticfacets.fields.HashedStringFieldDataTest

package org.leskes.test.elasticfacets.fields;

import org.apache.lucene.analysis.miscellaneous.PatternAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.elasticsearch.common.lucene.DocumentBuilder;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.trove.set.hash.TIntHashSet;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.field.data.FieldData.OrdinalInDocProc;
import org.leskes.elasticfacets.fields.HashedStringFieldData;
import org.leskes.elasticfacets.fields.HashedStringFieldType;
import org.leskes.elasticfacets.fields.MultiValueHashedStringFieldData;
import org.leskes.elasticfacets.fields.SingleValueHashedStringFieldData;
import org.testng.AssertJUnit;
import org.testng.annotations.Test;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.regex.Pattern;

import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasItem;

public class HashedStringFieldDataTest {
  protected void assertHash(String A, String B) {
    AssertJUnit.assertEquals("Hash code of " + A
        + " doesn't equal the one of " + B,
        HashedStringFieldType.hashCode(A),
        HashedStringFieldType.hashCode(B));
  }

  protected void assertHash(int A, String B) {
    AssertJUnit.assertEquals("Hash code doesn't equal the one of " + B, A,
        HashedStringFieldType.hashCode(B));
  }

  protected void assertHash(String A, int B) {
    AssertJUnit.assertEquals("Hash code doesn't equal the one of " + A,
        HashedStringFieldType.hashCode(A), B);
  }

  protected void assertHash(ArrayList<Integer> values, String A) {
    assertThat(values, hasItem(HashedStringFieldType.hashCode(A)));
  }
 
  protected void assertFieldWithSet(HashedStringFieldData field,int docId,String[] set){
   
    assertThat(field.hasValue(docId),equalTo(set.length>0));
   
    final ArrayList<Integer> values = new ArrayList<Integer>();
    int missing = set.length >0 ? 0 : 1;
    assertThat(getDocHashes(docId, field, values),equalTo(missing));
    assertThat(values.size(),equalTo(set.length));
    for (String s : set ){
      assertHash(values,s);
    }

    assertThat(getDocOrdinals(docId, field, values),equalTo(missing));
    assertThat(values.size(),equalTo(set.length));

  }

  @Test
  public void singleValueHashedStringFieldDataTests() throws Exception {
    Directory dir = new RAMDirectory();
    IndexWriter indexWriter = new IndexWriter(dir, new IndexWriterConfig(
        Lucene.VERSION, Lucene.STANDARD_ANALYZER));

    indexWriter.addDocument(DocumentBuilder.doc()
        .add(DocumentBuilder.field("svalue", "zzz")).build());

    indexWriter.addDocument(DocumentBuilder.doc()
        .add(DocumentBuilder.field("svalue", "xxx")).build());

    indexWriter.addDocument(DocumentBuilder.doc().build());

    indexWriter.addDocument(DocumentBuilder.doc()
        .add(DocumentBuilder.field("svalue", "aaa")).build());

    indexWriter.addDocument(DocumentBuilder.doc()
        .add(DocumentBuilder.field("svalue", "aaa")).build());

    IndexReader reader = IndexReader.open(indexWriter, true);

      HashedStringFieldType type = new HashedStringFieldType(new HashedStringFieldData.HashedStringTypeLoader(0,0, null, null));
    SingleValueHashedStringFieldData sFieldData = (SingleValueHashedStringFieldData) type.load(reader, "svalue");

    assert (sFieldData.fieldName().equals("svalue"));
    assert (!sFieldData.multiValued());
    assertThat(sFieldData.collisions(),equalTo(0));


    assertHash("zzz", sFieldData.hashValue(0));
    assertFieldWithSet(sFieldData, 0, new String[] { "zzz"});

    assertHash("xxx", sFieldData.hashValue(1));
    assertFieldWithSet(sFieldData, 1, new String[] { "xxx"});

    assertFieldWithSet(sFieldData, 2, new String[] {});

    assertHash("aaa", sFieldData.hashValue(3));
    assertFieldWithSet(sFieldData, 3, new String[] { "aaa"});

    assertHash("aaa", sFieldData.hashValue(4));
    assertFieldWithSet(sFieldData, 4, new String[] { "aaa"});
   

    indexWriter.close();
  }

  @Test
  public void singleValueHashedStringFieldData100Entires() throws Exception {
    Directory dir = new RAMDirectory();
    IndexWriter indexWriter = new IndexWriter(dir, new IndexWriterConfig(
        Lucene.VERSION, Lucene.STANDARD_ANALYZER));

    indexWriter.addDocument(DocumentBuilder.doc().build());

    for (int i = 0; i < 100; i++)
      indexWriter.addDocument(DocumentBuilder
          .doc()
          .add(DocumentBuilder.field("svalue",
              String.format("term_%s", i))).build());

    IndexReader reader = IndexReader.open(indexWriter, true);

      HashedStringFieldType type = new HashedStringFieldType(new HashedStringFieldData.HashedStringTypeLoader(0,0, null, null));

      SingleValueHashedStringFieldData sFieldData = (SingleValueHashedStringFieldData) type.load(reader, "svalue");

    assertThat(sFieldData.fieldName(), equalTo("svalue"));
    assertThat(sFieldData.multiValued(), equalTo(false));
    assertThat(sFieldData.collisions(),equalTo(0));


    int[] sortedValues = Arrays.copyOf(sFieldData.values(),
        sFieldData.values().length);
    Arrays.sort(sortedValues,1,sortedValues.length);
    assertThat("Internal values of field data are not sorted!",
        sFieldData.values(), equalTo(sortedValues));

    assertThat(sFieldData.hasValue(0), equalTo(false));// first doc had no
                              // value!

    for (int i = 0; i < 100; i++) {
      int docId = i + 1;
      assertThat(sFieldData.hasValue(docId), equalTo(true));
      String term = String.format("term_%s", i);
      assertHash(term, sFieldData.hashValue(docId));

      final ArrayList<Integer> values = new ArrayList<Integer>();
      getDocHashes(docId, sFieldData, values);
      assertThat(values.size(), equalTo(1));

      assertHash(values, term);

      values.clear();
    }
   


    indexWriter.close();
  }

  @Test
  public void multiValueHashedStringFieldDataTests() throws Exception {
    Directory dir = new RAMDirectory();
    IndexWriter indexWriter = new IndexWriter(dir, new IndexWriterConfig(
        Lucene.VERSION, Lucene.STANDARD_ANALYZER));

    indexWriter.addDocument(DocumentBuilder.doc()
        .add(DocumentBuilder.field("mvalue", "zzz"))
        .add(DocumentBuilder.field("mvalue", "xxx")).build());

    indexWriter.addDocument(DocumentBuilder.doc().build());

    indexWriter.addDocument(DocumentBuilder.doc()
        .add(DocumentBuilder.field("mvalue", "aaa")).build());

    indexWriter.addDocument(DocumentBuilder.doc()
        .add(DocumentBuilder.field("mvalue", "aaa")).build());

    IndexReader reader = IndexReader.open(indexWriter, true);

      HashedStringFieldType type = new HashedStringFieldType(new HashedStringFieldData.HashedStringTypeLoader(0,0, null, null));
      MultiValueHashedStringFieldData sFieldData = (MultiValueHashedStringFieldData) type.load(reader, "mvalue");

    assert (sFieldData.fieldName().equals("mvalue"));
    assert (sFieldData.multiValued());
    assertThat(sFieldData.collisions(),equalTo(0));


   
    assertFieldWithSet(sFieldData, 0, new String[] { "zzz","xxx"});

    assertFieldWithSet(sFieldData, 1, new String[] { });

    assertFieldWithSet(sFieldData, 2, new String[] { "aaa" });
 
    assertFieldWithSet(sFieldData, 3, new String[] { "aaa" });

    indexWriter.close();
  }

   @Test
   public void multiValueHashedStringExcludeTermsTests() throws Exception {
      Directory dir = new RAMDirectory();
      IndexWriter indexWriter = new IndexWriter(dir, new IndexWriterConfig(
              Lucene.VERSION, Lucene.STANDARD_ANALYZER));

      indexWriter.addDocument(DocumentBuilder.doc()
              .add(DocumentBuilder.field("mvalue", "zzz"))
              .add(DocumentBuilder.field("mvalue", "xxx"))
              .add(DocumentBuilder.field("mvalue", "123")).build());

      indexWriter.addDocument(DocumentBuilder.doc().build());

      indexWriter.addDocument(DocumentBuilder.doc()
              .add(DocumentBuilder.field("mvalue", "aaa")).build());

      indexWriter.addDocument(DocumentBuilder.doc()
              .add(DocumentBuilder.field("mvalue", "aaa")).build());

      IndexReader reader = IndexReader.open(indexWriter, true);

      TIntHashSet excludeTerms = new TIntHashSet();
      excludeTerms.add(HashedStringFieldType.hashCode("xxx"));

      Pattern excludePattern = Pattern.compile("\\d{3}|a"); // the a is to test full token matching

      HashedStringFieldType type = new HashedStringFieldType(
              new HashedStringFieldData.HashedStringTypeLoader(0,0, excludePattern,excludeTerms));

      // we exclude the only multi value, expect a single value array
      SingleValueHashedStringFieldData sFieldData = (SingleValueHashedStringFieldData) type.load(reader, "mvalue");

      assert (sFieldData.fieldName().equals("mvalue"));
      assert (!sFieldData.multiValued());
      assertThat(sFieldData.collisions(),equalTo(0));



      assertFieldWithSet(sFieldData, 0, new String[] { "zzz"});

      assertFieldWithSet(sFieldData, 1, new String[] { });

      assertFieldWithSet(sFieldData, 2, new String[] { "aaa" });

      assertFieldWithSet(sFieldData, 3, new String[] { "aaa" });

      indexWriter.close();
   }
 
  @Test
  public void TestMultiValueCollisionDetection() throws Exception {
    Directory dir = new RAMDirectory();
    IndexWriter indexWriter = new IndexWriter(dir, new IndexWriterConfig(
        Lucene.VERSION, new PatternAnalyzer(Version.LUCENE_36, PatternAnalyzer.NON_WORD_PATTERN, false, null)));

    indexWriter.addDocument(DocumentBuilder.doc()
        .add(DocumentBuilder.field("mvalue", "FB"))
        .add(DocumentBuilder.field("mvalue", "Ea")).build());

    indexWriter.addDocument(DocumentBuilder.doc()
        .add(DocumentBuilder.field("mvalue", "BB"))
        .add(DocumentBuilder.field("mvalue", "Aa")).build());

   
    IndexReader reader = IndexReader.open(indexWriter, true);

      HashedStringFieldType type = new HashedStringFieldType(new HashedStringFieldData.HashedStringTypeLoader(0,0, null, null));

      MultiValueHashedStringFieldData sFieldData = (MultiValueHashedStringFieldData) type.load(reader, "mvalue");
   
    assertThat(sFieldData.collisions(),equalTo(2));
    indexWriter.close();

  }

   @Test
   public void TestMultiValueMaxTermsPerDoc() throws Exception {
      Directory dir = new RAMDirectory();
      IndexWriter indexWriter = new IndexWriter(dir, new IndexWriterConfig(
              Lucene.VERSION, new PatternAnalyzer(Version.LUCENE_36, PatternAnalyzer.WHITESPACE_PATTERN, false, null)));

      DocumentBuilder d = DocumentBuilder.doc();

      for (int i=0;i<200;i++) d.add(DocumentBuilder.field("mvalue", "t" + i));

      indexWriter.addDocument(d.build());

      indexWriter.addDocument(DocumentBuilder.doc()
              .add(DocumentBuilder.field("mvalue", "t1"))
              .add(DocumentBuilder.field("mvalue", "t2")).build());


      IndexReader reader = IndexReader.open(indexWriter, true);

      HashedStringFieldType type = new HashedStringFieldType(new HashedStringFieldData.HashedStringTypeLoader(199,0, null, null));

      MultiValueHashedStringFieldData sFieldData = (MultiValueHashedStringFieldData)type.load(reader, "mvalue");

      assertThat(sFieldData.hasValue(0),equalTo(false));
      assertThat(sFieldData.hasValue(1),equalTo(true));
      indexWriter.close();

   }

   @Test
   public void TestMultiValueMinDocsPerTerm() throws Exception {
      Directory dir = new RAMDirectory();
      IndexWriter indexWriter = new IndexWriter(dir, new IndexWriterConfig(
              Lucene.VERSION, new PatternAnalyzer(Version.LUCENE_36, PatternAnalyzer.WHITESPACE_PATTERN, false, null)));

      DocumentBuilder d = DocumentBuilder.doc();

      d.add(DocumentBuilder.field("mvalue", "t1"));

      indexWriter.addDocument(d.build());

      indexWriter.addDocument(DocumentBuilder.doc()
              .add(DocumentBuilder.field("mvalue", "t1"))
              .add(DocumentBuilder.field("mvalue", "t2")).build());


      IndexReader reader = IndexReader.open(indexWriter, true);


      HashedStringFieldType type = new HashedStringFieldType(new HashedStringFieldData.HashedStringTypeLoader(0,2, null, null));

      SingleValueHashedStringFieldData sFieldData = (SingleValueHashedStringFieldData) type.load(reader, "mvalue");

      assertThat(sFieldData.hasValue(0),equalTo(true));
      final int[] count = {0};
      sFieldData.forEachValueInDoc(0, new HashedStringFieldData.HashedStringValueInDocProc() {
         @Override
         public void onValue(int docId, int Hash) {
            count[0]++;
         }

         @Override
         public void onMissing(int docId) {

         }
      });
      assertThat(count[0], equalTo(1));

      count[0]=0;
      sFieldData.forEachValueInDoc(1, new HashedStringFieldData.HashedStringValueInDocProc() {
         @Override
         public void onValue(int docId, int Hash) {
            count[0]++;
         }

         @Override
         public void onMissing(int docId) {

         }
      });
      assertThat(sFieldData.hasValue(1), equalTo(true));
      assertThat(count[0], equalTo(1));
      indexWriter.close();

   }



   protected int getDocHashes(int docId, HashedStringFieldData sFieldData,
      final ArrayList<Integer> values) {
    values.clear();
    final ArrayList<Integer> missing = new ArrayList<Integer>();
    sFieldData.forEachValueInDoc(docId,
        new HashedStringFieldData.HashedStringValueInDocProc() {
          public void onValue(int docId, int Hash) {
            values.add(Hash);
          }

          public void onMissing(int docId) {
            missing.add(docId);
          }
        });
    return missing.size();
  }
 
  protected int getDocOrdinals(final int docId, HashedStringFieldData sFieldData,
      final ArrayList<Integer> values) {
    values.clear();
    final ArrayList<Integer> missing = new ArrayList<Integer>();
    sFieldData.forEachOrdinalInDoc(docId,
        new OrdinalInDocProc() {
       
          public void onOrdinal(int int_docId, int ordinal) {
            assertThat(int_docId,equalTo(docId));
            if (ordinal == 0)
              missing.add(int_docId);
            else
              values.add(ordinal);
           
          }
        });
    return missing.size();
  }

   @Test
   public void ResolvingHashTest(){
      NamedAnalyzer analyzer = Lucene.STANDARD_ANALYZER;
      String value = "Some text with spaces";
      String term = HashedStringFieldData.analyzeStringForTerm(value,
              HashedStringFieldType.hashCode("spaces"), "field", analyzer);
      assertThat(term,equalTo("spaces"));
      value = "Some other text with spaces and more";
      term = HashedStringFieldData.analyzeStringForTerm(value,
              HashedStringFieldType.hashCode("more"), "field", analyzer);
      assertThat(term,equalTo("more"));
   }

}
// TOP
//
// Related Classes of org.leskes.test.elasticfacets.fields.HashedStringFieldDataTest
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.