package org.xbib.elasticsearch.skywalker.reconstruct;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.CompositeReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Bits;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.common.xcontent.ToXContent;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.xbib.elasticsearch.action.skywalker.support.IndexableFieldToXContent;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
/**
 * This class attempts to reconstruct all fields from a document existing in a
 * Lucene index. This operation may be (and usually is) lossy - e.g. unstored
 * fields are rebuilt from terms present in the index, and these terms may have
 * been changed (e.g. lowercased, stemmed), and many other input tokens may have
 * been skipped altogether by the Analyzer when the fields were originally added
 * to the index.
 *
 */
public class DocumentReconstructor {

    // Atomic view of the index; composite readers are wrapped on construction.
    private final AtomicReader reader;

    /**
     * Prepare a document reconstructor.
     *
     * @param indexReader IndexReader to read from
     * @throws ElasticsearchIllegalArgumentException if the reader is null, of an
     *         unsupported class, or cannot be wrapped into an atomic reader
     */
    public DocumentReconstructor(IndexReader indexReader) {
        if (indexReader == null) {
            throw new ElasticsearchIllegalArgumentException("reader cannot be null");
        }
        try {
            if (indexReader instanceof CompositeReader) {
                // flatten the composite reader into a single atomic view
                this.reader = SlowCompositeReaderWrapper.wrap(indexReader);
            } else if (indexReader instanceof AtomicReader) {
                this.reader = (AtomicReader) indexReader;
            } else {
                throw new ElasticsearchIllegalArgumentException("unsupported IndexReader class " + indexReader.getClass().getName());
            }
        } catch (IOException e) {
            // keep the original exception as cause instead of flattening it to a message
            throw new ElasticsearchIllegalArgumentException(e.getMessage(), e);
        }
    }

    /**
     * Reconstruct the live documents of an index shard, emitting for each
     * document its stored fields and, per field, the indexed terms with
     * positions and offsets.
     *
     * @param shardId the shard id to report in the output
     * @return a JSON builder containing the reconstructed documents
     * @throws IOException if reading the index fails
     */
    public XContentBuilder reconstruct(int shardId) throws IOException {
        XContentBuilder builder = jsonBuilder();
        builder.startObject()
                .field("shardId", shardId)
                .field("numDeletions", reader.numDeletedDocs());
        builder.startArray("docs");
        FieldInfos fieldInfos = reader.getFieldInfos();
        Bits live = MultiFields.getLiveDocs(reader);
        for (int docNum = 0; docNum < reader.maxDoc(); docNum++) {
            Document doc = reader.document(docNum);
            // Lucene liveDocs bits are SET for live documents; a cleared bit
            // marks a deleted doc. The previous check was inverted and emitted
            // only deleted documents (and, when live == null because nothing
            // was deleted, all documents) - skip deleted docs instead.
            if (live != null && !live.get(docNum)) {
                continue; // deleted document
            }
            builder.startObject().startArray("fields");
            if (fieldInfos != null) {
                for (FieldInfo fi : fieldInfos) {
                    IndexableField[] fields = doc.getFields(fi.name);
                    if (fields != null) {
                        for (IndexableField field : fields) {
                            IndexableFieldToXContent x = new IndexableFieldToXContent().field(field);
                            x.toXContent(builder, ToXContent.EMPTY_PARAMS);
                        }
                    }
                }
            }
            builder.endArray();
            builder.startArray("terms");
            if (fieldInfos != null) {
                // enums are reused across fields/terms where the codec allows it
                TermsEnum te = null;
                DocsAndPositionsEnum dpe = null;
                for (FieldInfo fi : fieldInfos) {
                    Terms terms = MultiFields.getTerms(reader, fi.name);
                    if (terms == null) { // no terms in this field
                        continue;
                    }
                    te = terms.iterator(te);
                    while (te.next() != null) {
                        DocsAndPositionsEnum newDpe = te.docsAndPositions(live, dpe, 0);
                        if (newDpe == null) { // no position info for this field
                            break;
                        }
                        dpe = newDpe;
                        int num = dpe.advance(docNum);
                        if (num != docNum) { // either greater than docNum or NO_MORE_DOCS
                            continue; // no data for this term in this doc
                        }
                        String text = te.term().utf8ToString();
                        // freq() is invariant for the current doc: hoist it and
                        // presize the lists instead of calling it per iteration
                        int freq = dpe.freq();
                        List<Integer> positions = new ArrayList<Integer>(freq);
                        List<Integer> starts = new ArrayList<Integer>(freq);
                        List<Integer> ends = new ArrayList<Integer>(freq);
                        for (int k = 0; k < freq; k++) {
                            positions.add(dpe.nextPosition());
                            starts.add(dpe.startOffset());
                            ends.add(dpe.endOffset());
                        }
                        builder.startObject()
                                .field("text", text)
                                .field("positions", positions)
                                .field("starts", starts)
                                .field("ends", ends)
                                .field("count", freq)
                                .endObject();
                    }
                }
            }
            builder.endArray();
            builder.endObject();
        }
        builder.endArray();
        builder.endObject();
        return builder;
    }
}