package org.leskes.test.elasticfacets.fields;
import org.apache.lucene.analysis.miscellaneous.PatternAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.elasticsearch.common.lucene.DocumentBuilder;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.trove.set.hash.TIntHashSet;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.field.data.FieldData.OrdinalInDocProc;
import org.leskes.elasticfacets.fields.HashedStringFieldData;
import org.leskes.elasticfacets.fields.HashedStringFieldType;
import org.leskes.elasticfacets.fields.MultiValueHashedStringFieldData;
import org.leskes.elasticfacets.fields.SingleValueHashedStringFieldData;
import org.testng.AssertJUnit;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.regex.Pattern;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasItem;
/**
 * Tests for {@link HashedStringFieldData} and its single/multi valued variants as
 * produced by {@link HashedStringFieldType#load}.
 *
 * <p>Every test builds a small in-memory index, loads the field data for one field
 * and checks the per-document hashes/ordinals. All Lucene resources are released in
 * {@code finally} blocks so that a failing assertion does not leak an open reader or
 * writer.
 */
public class HashedStringFieldDataTest {

    /**
     * Asserts that both strings hash to the same value under
     * {@link HashedStringFieldType#hashCode(String)}.
     */
    protected void assertHash(String A, String B) {
        AssertJUnit.assertEquals("Hash code of " + A
                + " doesn't equal the one of " + B,
                HashedStringFieldType.hashCode(A),
                HashedStringFieldType.hashCode(B));
    }

    /**
     * Asserts that the hash of {@code B} equals the expected hash {@code A}.
     */
    protected void assertHash(int A, String B) {
        AssertJUnit.assertEquals("Hash code doesn't equal the one of " + B, A,
                HashedStringFieldType.hashCode(B));
    }

    /**
     * Asserts that the actual hash {@code B} equals the hash of the string {@code A}.
     */
    protected void assertHash(String A, int B) {
        AssertJUnit.assertEquals("Hash code doesn't equal the one of " + A,
                HashedStringFieldType.hashCode(A), B);
    }

    /**
     * Asserts that the hash of {@code A} is one of the collected {@code values}.
     */
    protected void assertHash(ArrayList<Integer> values, String A) {
        assertThat(values, hasItem(HashedStringFieldType.hashCode(A)));
    }

    /**
     * Asserts that document {@code docId} holds exactly the terms in {@code set}.
     *
     * <p>Checks {@code hasValue}, the hashes reported by {@code forEachValueInDoc}
     * and the number of non-zero ordinals reported by {@code forEachOrdinalInDoc}.
     * An empty {@code set} means the document is expected to be reported as missing
     * exactly once by each iteration method.
     */
    protected void assertFieldWithSet(HashedStringFieldData field, int docId, String[] set) {
        assertThat(field.hasValue(docId), equalTo(set.length > 0));

        final ArrayList<Integer> values = new ArrayList<Integer>();
        // a document without values must trigger exactly one "missing" callback
        int missing = set.length > 0 ? 0 : 1;

        assertThat(getDocHashes(docId, field, values), equalTo(missing));
        assertThat(values.size(), equalTo(set.length));
        for (String s : set) {
            assertHash(values, s);
        }

        assertThat(getDocOrdinals(docId, field, values), equalTo(missing));
        assertThat(values.size(), equalTo(set.length));
    }

    /**
     * Closes the writer and then the directory; the directory is closed even when
     * closing the writer fails.
     */
    private static void closeIndex(IndexWriter indexWriter, Directory dir) throws Exception {
        try {
            indexWriter.close();
        } finally {
            dir.close();
        }
    }

    /**
     * Loads a field where every document has at most one term and verifies the
     * hashes of each document, including a document without the field.
     */
    @Test
    public void singleValueHashedStringFieldDataTests() throws Exception {
        Directory dir = new RAMDirectory();
        IndexWriter indexWriter = new IndexWriter(dir, new IndexWriterConfig(
                Lucene.VERSION, Lucene.STANDARD_ANALYZER));
        try {
            indexWriter.addDocument(DocumentBuilder.doc()
                    .add(DocumentBuilder.field("svalue", "zzz")).build());
            indexWriter.addDocument(DocumentBuilder.doc()
                    .add(DocumentBuilder.field("svalue", "xxx")).build());
            indexWriter.addDocument(DocumentBuilder.doc().build());
            indexWriter.addDocument(DocumentBuilder.doc()
                    .add(DocumentBuilder.field("svalue", "aaa")).build());
            indexWriter.addDocument(DocumentBuilder.doc()
                    .add(DocumentBuilder.field("svalue", "aaa")).build());

            IndexReader reader = IndexReader.open(indexWriter, true);
            try {
                HashedStringFieldType type = new HashedStringFieldType(
                        new HashedStringFieldData.HashedStringTypeLoader(0, 0, null, null));
                SingleValueHashedStringFieldData sFieldData =
                        (SingleValueHashedStringFieldData) type.load(reader, "svalue");

                // real assertions instead of the `assert` keyword, which is skipped without -ea
                assertThat(sFieldData.fieldName(), equalTo("svalue"));
                assertThat(sFieldData.multiValued(), equalTo(false));
                assertThat(sFieldData.collisions(), equalTo(0));

                assertHash("zzz", sFieldData.hashValue(0));
                assertFieldWithSet(sFieldData, 0, new String[] { "zzz" });

                assertHash("xxx", sFieldData.hashValue(1));
                assertFieldWithSet(sFieldData, 1, new String[] { "xxx" });

                assertFieldWithSet(sFieldData, 2, new String[] {});

                assertHash("aaa", sFieldData.hashValue(3));
                assertFieldWithSet(sFieldData, 3, new String[] { "aaa" });

                assertHash("aaa", sFieldData.hashValue(4));
                assertFieldWithSet(sFieldData, 4, new String[] { "aaa" });
            } finally {
                reader.close();
            }
        } finally {
            closeIndex(indexWriter, dir);
        }
    }

    /**
     * Indexes one empty document followed by 100 documents with a unique term each
     * and verifies that the internal value array is sorted and that every document
     * resolves to the hash of its own term.
     */
    @Test
    public void singleValueHashedStringFieldData100Entires() throws Exception {
        Directory dir = new RAMDirectory();
        IndexWriter indexWriter = new IndexWriter(dir, new IndexWriterConfig(
                Lucene.VERSION, Lucene.STANDARD_ANALYZER));
        try {
            indexWriter.addDocument(DocumentBuilder.doc().build());
            for (int i = 0; i < 100; i++) {
                indexWriter.addDocument(DocumentBuilder
                        .doc()
                        .add(DocumentBuilder.field("svalue",
                                String.format("term_%s", i))).build());
            }

            IndexReader reader = IndexReader.open(indexWriter, true);
            try {
                HashedStringFieldType type = new HashedStringFieldType(
                        new HashedStringFieldData.HashedStringTypeLoader(0, 0, null, null));
                SingleValueHashedStringFieldData sFieldData =
                        (SingleValueHashedStringFieldData) type.load(reader, "svalue");

                assertThat(sFieldData.fieldName(), equalTo("svalue"));
                assertThat(sFieldData.multiValued(), equalTo(false));
                assertThat(sFieldData.collisions(), equalTo(0));

                // index 0 is left out of the sort; presumably it is the slot for
                // "no value" -- TODO confirm against the field data implementation
                int[] sortedValues = Arrays.copyOf(sFieldData.values(),
                        sFieldData.values().length);
                Arrays.sort(sortedValues, 1, sortedValues.length);
                assertThat("Internal values of field data are not sorted!",
                        sFieldData.values(), equalTo(sortedValues));

                // first doc had no value!
                assertThat(sFieldData.hasValue(0), equalTo(false));

                for (int i = 0; i < 100; i++) {
                    int docId = i + 1; // doc 0 is the empty document
                    assertThat(sFieldData.hasValue(docId), equalTo(true));

                    String term = String.format("term_%s", i);
                    assertHash(term, sFieldData.hashValue(docId));

                    final ArrayList<Integer> values = new ArrayList<Integer>();
                    getDocHashes(docId, sFieldData, values);
                    assertThat(values.size(), equalTo(1));
                    assertHash(values, term);
                }
            } finally {
                reader.close();
            }
        } finally {
            closeIndex(indexWriter, dir);
        }
    }

    /**
     * Loads a field where one document carries two terms and verifies that the
     * multi valued implementation reports every term of every document.
     */
    @Test
    public void multiValueHashedStringFieldDataTests() throws Exception {
        Directory dir = new RAMDirectory();
        IndexWriter indexWriter = new IndexWriter(dir, new IndexWriterConfig(
                Lucene.VERSION, Lucene.STANDARD_ANALYZER));
        try {
            indexWriter.addDocument(DocumentBuilder.doc()
                    .add(DocumentBuilder.field("mvalue", "zzz"))
                    .add(DocumentBuilder.field("mvalue", "xxx")).build());
            indexWriter.addDocument(DocumentBuilder.doc().build());
            indexWriter.addDocument(DocumentBuilder.doc()
                    .add(DocumentBuilder.field("mvalue", "aaa")).build());
            indexWriter.addDocument(DocumentBuilder.doc()
                    .add(DocumentBuilder.field("mvalue", "aaa")).build());

            IndexReader reader = IndexReader.open(indexWriter, true);
            try {
                HashedStringFieldType type = new HashedStringFieldType(
                        new HashedStringFieldData.HashedStringTypeLoader(0, 0, null, null));
                MultiValueHashedStringFieldData sFieldData =
                        (MultiValueHashedStringFieldData) type.load(reader, "mvalue");

                assertThat(sFieldData.fieldName(), equalTo("mvalue"));
                assertThat(sFieldData.multiValued(), equalTo(true));
                assertThat(sFieldData.collisions(), equalTo(0));

                assertFieldWithSet(sFieldData, 0, new String[] { "zzz", "xxx" });
                assertFieldWithSet(sFieldData, 1, new String[] {});
                assertFieldWithSet(sFieldData, 2, new String[] { "aaa" });
                assertFieldWithSet(sFieldData, 3, new String[] { "aaa" });
            } finally {
                reader.close();
            }
        } finally {
            closeIndex(indexWriter, dir);
        }
    }

    /**
     * Verifies term exclusion by explicit hash set and by regular expression.
     *
     * <p>Document 0 is indexed with "zzz", "xxx" and "123"; "xxx" is excluded through
     * the hash set and "123" through the pattern, so only "zzz" may remain and the
     * loaded field data is expected to be single valued.
     */
    @Test
    public void multiValueHashedStringExcludeTermsTests() throws Exception {
        Directory dir = new RAMDirectory();
        IndexWriter indexWriter = new IndexWriter(dir, new IndexWriterConfig(
                Lucene.VERSION, Lucene.STANDARD_ANALYZER));
        try {
            indexWriter.addDocument(DocumentBuilder.doc()
                    .add(DocumentBuilder.field("mvalue", "zzz"))
                    .add(DocumentBuilder.field("mvalue", "xxx"))
                    .add(DocumentBuilder.field("mvalue", "123")).build());
            indexWriter.addDocument(DocumentBuilder.doc().build());
            indexWriter.addDocument(DocumentBuilder.doc()
                    .add(DocumentBuilder.field("mvalue", "aaa")).build());
            indexWriter.addDocument(DocumentBuilder.doc()
                    .add(DocumentBuilder.field("mvalue", "aaa")).build());

            IndexReader reader = IndexReader.open(indexWriter, true);
            try {
                TIntHashSet excludeTerms = new TIntHashSet();
                excludeTerms.add(HashedStringFieldType.hashCode("xxx"));
                // the "a" alternative checks full token matching: "aaa" merely
                // contains "a" and is asserted below to still be present
                Pattern excludePattern = Pattern.compile("\\d{3}|a");

                HashedStringFieldType type = new HashedStringFieldType(
                        new HashedStringFieldData.HashedStringTypeLoader(0, 0, excludePattern, excludeTerms));

                // we exclude the only multi value, expect a single value array
                SingleValueHashedStringFieldData sFieldData =
                        (SingleValueHashedStringFieldData) type.load(reader, "mvalue");

                assertThat(sFieldData.fieldName(), equalTo("mvalue"));
                assertThat(sFieldData.multiValued(), equalTo(false));
                assertThat(sFieldData.collisions(), equalTo(0));

                assertFieldWithSet(sFieldData, 0, new String[] { "zzz" });
                assertFieldWithSet(sFieldData, 1, new String[] {});
                assertFieldWithSet(sFieldData, 2, new String[] { "aaa" });
                assertFieldWithSet(sFieldData, 3, new String[] { "aaa" });
            } finally {
                reader.close();
            }
        } finally {
            closeIndex(indexWriter, dir);
        }
    }

    /**
     * Indexes two pairs of distinct terms ("FB"/"Ea" and "BB"/"Aa") and expects the
     * loaded field data to report two collisions.
     *
     * <p>Each pair collides under {@link String#hashCode()}; presumably
     * {@link HashedStringFieldType#hashCode(String)} behaves the same for these
     * inputs -- confirm against its implementation.
     */
    @Test
    public void TestMultiValueCollisionDetection() throws Exception {
        Directory dir = new RAMDirectory();
        IndexWriter indexWriter = new IndexWriter(dir, new IndexWriterConfig(
                Lucene.VERSION, new PatternAnalyzer(Version.LUCENE_36, PatternAnalyzer.NON_WORD_PATTERN, false, null)));
        try {
            indexWriter.addDocument(DocumentBuilder.doc()
                    .add(DocumentBuilder.field("mvalue", "FB"))
                    .add(DocumentBuilder.field("mvalue", "Ea")).build());
            indexWriter.addDocument(DocumentBuilder.doc()
                    .add(DocumentBuilder.field("mvalue", "BB"))
                    .add(DocumentBuilder.field("mvalue", "Aa")).build());

            IndexReader reader = IndexReader.open(indexWriter, true);
            try {
                HashedStringFieldType type = new HashedStringFieldType(
                        new HashedStringFieldData.HashedStringTypeLoader(0, 0, null, null));
                MultiValueHashedStringFieldData sFieldData =
                        (MultiValueHashedStringFieldData) type.load(reader, "mvalue");

                assertThat(sFieldData.collisions(), equalTo(2));
            } finally {
                reader.close();
            }
        } finally {
            closeIndex(indexWriter, dir);
        }
    }

    /**
     * Indexes a document with 200 distinct terms and one with two terms, loads the
     * field with a first loader argument of 199 and expects the large document to
     * end up without values while the small one keeps them.
     */
    @Test
    public void TestMultiValueMaxTermsPerDoc() throws Exception {
        Directory dir = new RAMDirectory();
        IndexWriter indexWriter = new IndexWriter(dir, new IndexWriterConfig(
                Lucene.VERSION, new PatternAnalyzer(Version.LUCENE_36, PatternAnalyzer.WHITESPACE_PATTERN, false, null)));
        try {
            DocumentBuilder d = DocumentBuilder.doc();
            for (int i = 0; i < 200; i++) {
                d.add(DocumentBuilder.field("mvalue", "t" + i));
            }
            indexWriter.addDocument(d.build());
            indexWriter.addDocument(DocumentBuilder.doc()
                    .add(DocumentBuilder.field("mvalue", "t1"))
                    .add(DocumentBuilder.field("mvalue", "t2")).build());

            IndexReader reader = IndexReader.open(indexWriter, true);
            try {
                HashedStringFieldType type = new HashedStringFieldType(
                        new HashedStringFieldData.HashedStringTypeLoader(199, 0, null, null));
                MultiValueHashedStringFieldData sFieldData =
                        (MultiValueHashedStringFieldData) type.load(reader, "mvalue");

                assertThat(sFieldData.hasValue(0), equalTo(false));
                assertThat(sFieldData.hasValue(1), equalTo(true));
            } finally {
                reader.close();
            }
        } finally {
            closeIndex(indexWriter, dir);
        }
    }

    /**
     * Indexes "t1" in two documents and "t2" in only one, loads the field with a
     * second loader argument of 2 and expects every document to report exactly one
     * value through a single valued field data instance.
     */
    @Test
    public void TestMultiValueMinDocsPerTerm() throws Exception {
        Directory dir = new RAMDirectory();
        IndexWriter indexWriter = new IndexWriter(dir, new IndexWriterConfig(
                Lucene.VERSION, new PatternAnalyzer(Version.LUCENE_36, PatternAnalyzer.WHITESPACE_PATTERN, false, null)));
        try {
            DocumentBuilder d = DocumentBuilder.doc();
            d.add(DocumentBuilder.field("mvalue", "t1"));
            indexWriter.addDocument(d.build());
            indexWriter.addDocument(DocumentBuilder.doc()
                    .add(DocumentBuilder.field("mvalue", "t1"))
                    .add(DocumentBuilder.field("mvalue", "t2")).build());

            IndexReader reader = IndexReader.open(indexWriter, true);
            try {
                HashedStringFieldType type = new HashedStringFieldType(
                        new HashedStringFieldData.HashedStringTypeLoader(0, 2, null, null));
                SingleValueHashedStringFieldData sFieldData =
                        (SingleValueHashedStringFieldData) type.load(reader, "mvalue");

                // getDocHashes collects one entry per onValue callback, so the list
                // size is the number of values reported for the document
                final ArrayList<Integer> values = new ArrayList<Integer>();

                assertThat(sFieldData.hasValue(0), equalTo(true));
                getDocHashes(0, sFieldData, values);
                assertThat(values.size(), equalTo(1));

                getDocHashes(1, sFieldData, values);
                assertThat(sFieldData.hasValue(1), equalTo(true));
                assertThat(values.size(), equalTo(1));
            } finally {
                reader.close();
            }
        } finally {
            closeIndex(indexWriter, dir);
        }
    }

    /**
     * Collects the hashes reported by {@code forEachValueInDoc} for one document.
     *
     * @param docId      document to inspect
     * @param sFieldData field data to iterate
     * @param values     output list; cleared first, then filled with the reported hashes
     * @return number of {@code onMissing} callbacks received
     */
    protected int getDocHashes(int docId, HashedStringFieldData sFieldData,
            final ArrayList<Integer> values) {
        values.clear();
        final ArrayList<Integer> missing = new ArrayList<Integer>();
        sFieldData.forEachValueInDoc(docId,
                new HashedStringFieldData.HashedStringValueInDocProc() {
                    @Override
                    public void onValue(int docId, int Hash) {
                        values.add(Hash);
                    }

                    @Override
                    public void onMissing(int docId) {
                        missing.add(docId);
                    }
                });
        return missing.size();
    }

    /**
     * Collects the ordinals reported by {@code forEachOrdinalInDoc} for one document
     * and asserts that every callback carries the requested document id.
     *
     * @param docId      document to inspect
     * @param sFieldData field data to iterate
     * @param values     output list; cleared first, then filled with all non-zero ordinals
     * @return number of callbacks that reported ordinal 0, which this test treats as "missing"
     */
    protected int getDocOrdinals(final int docId, HashedStringFieldData sFieldData,
            final ArrayList<Integer> values) {
        values.clear();
        final ArrayList<Integer> missing = new ArrayList<Integer>();
        sFieldData.forEachOrdinalInDoc(docId,
                new OrdinalInDocProc() {
                    @Override
                    public void onOrdinal(int int_docId, int ordinal) {
                        assertThat(int_docId, equalTo(docId));
                        if (ordinal == 0) {
                            missing.add(int_docId);
                        } else {
                            values.add(ordinal);
                        }
                    }
                });
        return missing.size();
    }

    /**
     * Verifies that {@code analyzeStringForTerm} recovers the original term text
     * from a raw field value when given that term's hash.
     */
    @Test
    public void ResolvingHashTest() {
        NamedAnalyzer analyzer = Lucene.STANDARD_ANALYZER;

        String value = "Some text with spaces";
        String term = HashedStringFieldData.analyzeStringForTerm(value,
                HashedStringFieldType.hashCode("spaces"), "field", analyzer);
        assertThat(term, equalTo("spaces"));

        value = "Some other text with spaces and more";
        term = HashedStringFieldData.analyzeStringForTerm(value,
                HashedStringFieldType.hashCode("more"), "field", analyzer);
        assertThat(term, equalTo("more"));
    }
}