// Collect the cluster's document ids and build a bitset of the in-cluster documents.
Set<String> idSet = new HashSet<String>(ids);
int numDocs = reader.numDocs();
OpenBitSet clusterDocBitset = getClusterDocBitset(reader, idSet, this.idField);
log.info("Populating term infos from the index");
/*
 * This code is as that of CachedTermInfo, with one major change, which is to get the document frequency.
 *
 * Since we have deleted the documents out of the cluster, the document frequency for a term should only
 * include the in-cluster documents. The document frequency obtained from TermEnum reflects the frequency
 * in the entire index. To get the in-cluster frequency, we need to query the index to get the term
 * frequencies in each document. The number of results of this call will be the in-cluster document
 * frequency.
 */
TermEnum te = reader.terms(new Term(contentField, ""));
int count = 0;
Map<String,TermEntry> termEntryMap = new LinkedHashMap<String,TermEntry>();
try {
  do {
    Term term = te.term();
    // TermEnum is positioned at or after (contentField, ""); stop once we leave the field.
    if (term == null || !term.field().equals(contentField)) {
      break;
    }
    // Generate a bitset of all documents (in the whole index) containing this term.
    OpenBitSet termBitset = new OpenBitSet(reader.maxDoc());
    TermDocs termDocs = reader.termDocs(term);
    try {
      while (termDocs.next()) {
        termBitset.set(termDocs.doc());
      }
    } finally {
      // TermDocs holds index resources; close it even if iteration fails.
      termDocs.close();
    }
    // AND the term's bitset with cluster doc bitset to get the term's in-cluster frequency.
    // This modifies the termBitset, but that's fine as we are not using it anywhere else.
    termBitset.and(clusterDocBitset);
    int inclusterDF = (int) termBitset.cardinality();
    TermEntry entry = new TermEntry(term.text(), count++, inclusterDF);
    termEntryMap.put(entry.term, entry);
  } while (te.next());
} finally {
  // Ensure the enumeration is released even when the loop throws.
  te.close();
}