package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Closeable;
import java.io.IOException;
import java.io.Reader;
import java.util.zip.DataFormatException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.AbstractField;
import org.apache.lucene.document.CompressionTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.FieldSelectorResult;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.BufferedIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.CloseableThreadLocal;
import org.apache.lucene.util.IOUtils;
import com.alimama.mdrill.fdtBlockCompress.FdtCompressIndexInput;
/**
* Class responsible for access to stored document fields.
* <p/>
* It uses &lt;segment&gt;.fdt and &lt;segment&gt;.fdx files.
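* <p>
* A minimal usage sketch (variable names here are hypothetical; the real
* lifecycle is managed by SegmentReader):
* <pre>
*   FieldsReader reader = new FieldsReader(dir, segmentName, fieldInfos);
*   try {
*     Document d = reader.doc(0, null); // a null selector loads every stored field
*   } finally {
*     reader.close();
*   }
* </pre>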
*/
final class FieldsReader implements Cloneable, Closeable {
private static final Log LOG = LogFactory.getLog(FieldsReader.class);
private final FieldInfos fieldInfos;
// The main fieldStream, used only for cloning.
private IndexInput cloneableFieldsStream;
// This is a clone of cloneableFieldsStream used for reading documents.
// It should not be cloned outside of a synchronized context.
private final IndexInput fieldsStream;
private final IndexInput cloneableIndexStream;
private final IndexInput indexStream;
private int numTotalDocs;
private int size;
private boolean closed;
private final int format;
private final int formatSize;
// The docID offset where our docs begin in the index
// file. This will be 0 if we have our own private file.
private int docStoreOffset;
private CloseableThreadLocal<IndexInput> fieldsStreamTL = new CloseableThreadLocal<IndexInput>();
private boolean isOriginal = false;
/** Returns a cloned FieldsReader that shares open
* IndexInputs with the original one. It is the caller's
* job not to close the original FieldsReader until all
* clones are closed (e.g., currently SegmentReader manages
* this logic). */
@Override
public Object clone() {
ensureOpen();
return new FieldsReader(fieldInfos, numTotalDocs, size, format, formatSize, docStoreOffset, cloneableFieldsStream, cloneableIndexStream);
}
/**
* Detects the code version this segment was written with. Returns either
* "2.x" for all pre-3.0 segments, or "3.0" for 3.0 segments. This method
* should not be called for 3.1+ segments since they already record their code
* version.
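* <p>
* A minimal sketch (the segment name is hypothetical):
* <pre>
*   String version = FieldsReader.detectCodeVersion(dir, "_0"); // "2.x" or "3.0"
* </pre>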
*/
static String detectCodeVersion(Directory dir, String segment) throws IOException {
IndexInput idxStream = dir.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.FIELDS_INDEX_EXTENSION), 1024);
try {
int format = idxStream.readInt();
if(format==FieldsWriterCompress.FORMAT_CURRENT) {
return "3.0";
}else if (format < FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS) {
return "2.x";
} else {
return "3.0";
}
} finally {
idxStream.close();
}
}
// Used only by clone
private FieldsReader(FieldInfos fieldInfos, int numTotalDocs, int size, int format, int formatSize,
int docStoreOffset, IndexInput cloneableFieldsStream, IndexInput cloneableIndexStream) {
this.fieldInfos = fieldInfos;
this.numTotalDocs = numTotalDocs;
this.size = size;
this.format = format;
this.formatSize = formatSize;
this.docStoreOffset = docStoreOffset;
this.cloneableFieldsStream = cloneableFieldsStream;
this.cloneableIndexStream = cloneableIndexStream;
fieldsStream = (IndexInput) cloneableFieldsStream.clone();
indexStream = (IndexInput) cloneableIndexStream.clone();
}
FieldsReader(Directory d, String segment, FieldInfos fn) throws IOException {
this(d, segment, fn, BufferedIndexInput.BUFFER_SIZE, -1, 0);
}
FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize) throws IOException {
this(d, segment, fn, readBufferSize, -1, 0);
}
FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize, int docStoreOffset, int size) throws IOException {
boolean success = false;
isOriginal = true;
try {
fieldInfos = fn;
cloneableFieldsStream = d.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.FIELDS_EXTENSION), readBufferSize);
final String indexStreamFN = IndexFileNames.segmentFileName(segment, IndexFileNames.FIELDS_INDEX_EXTENSION);
cloneableIndexStream = d.openInput(indexStreamFN, readBufferSize);
// First version of fdx did not include a format
// header, but, the first int will always be 0 in that
// case
int firstInt = cloneableIndexStream.readInt();
if (firstInt == 0)
format = 0;
else
format = firstInt;
if(format==FieldsWriterCompress.FORMAT_CURRENT)
{
formatSize = 4;
}else if (format > FieldsWriter.FORMAT_CURRENT)
{
throw new IndexFormatTooNewException(cloneableIndexStream, format, 0, FieldsWriter.FORMAT_CURRENT);
}
else if (format > FieldsWriter.FORMAT)
{
formatSize = 4;
}
else
{
formatSize = 0;
}
if (format < FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES)
cloneableFieldsStream.setModifiedUTF8StringsMode();
if(format==FieldsWriterCompress.FORMAT_CURRENT)
{
cloneableFieldsStream=new FdtCompressIndexInput(cloneableFieldsStream);
fieldsStream=(IndexInput) cloneableFieldsStream.clone();
}else{
fieldsStream = (IndexInput) cloneableFieldsStream.clone();
}
final long indexSize = cloneableIndexStream.length()-formatSize;
if (docStoreOffset != -1) {
// We read only a slice out of this shared fields file
this.docStoreOffset = docStoreOffset;
this.size = size;
// Verify the file is long enough to hold all of our
// docs
assert ((int) (indexSize / 8)) >= size + this.docStoreOffset: "indexSize=" + indexSize + " size=" + size + " docStoreOffset=" + docStoreOffset;
} else {
this.docStoreOffset = 0;
this.size = (int) (indexSize >> 3);
}
indexStream = (IndexInput) cloneableIndexStream.clone();
numTotalDocs = (int) (indexSize >> 3);
success = true;
} finally {
// With lock-less commits, it's entirely possible (and
// fine) to hit a FileNotFound exception above. In
// this case, we want to explicitly close any subset
// of things that were opened so that we don't have to
// wait for a GC to do so.
if (!success) {
close();
}
}
}
/**
* @throws AlreadyClosedException if this FieldsReader is closed
*/
private void ensureOpen() throws AlreadyClosedException {
if (closed) {
throw new AlreadyClosedException("this FieldsReader is closed");
}
}
/**
* Closes the underlying {@link org.apache.lucene.store.IndexInput} streams, including any associated with a
* lazy implementation of a Field. This means the Field values will no longer be accessible.
*
* @throws IOException if there is a low-level IO error
*/
public final void close() throws IOException {
if (!closed) {
if (isOriginal) {
IOUtils.close(fieldsStream, indexStream, fieldsStreamTL, cloneableFieldsStream, cloneableIndexStream);
} else {
IOUtils.close(fieldsStream, indexStream, fieldsStreamTL);
}
closed = true;
}
}
final int size() {
return size;
}
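// The .fdx index stream is a format header of formatSize bytes (0 or 4)
// followed by one 8-byte absolute pointer into the .fdt file per document,
// so the pointer for doc N lives at byte formatSize + (docStoreOffset + N) * 8.
// For example, with formatSize == 4 and docStoreOffset == 0, doc 10's pointer
// starts at byte 4 + 80 = 84.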
private final void seekIndex(int docID) throws IOException {
indexStream.seek(formatSize + (docID + docStoreOffset) * 8L);
}
boolean canReadRawDocs() {
// Disable reading raw docs in 2.x format, because of the removal of compressed
// fields in 3.0. We don't want rawDocs() to decode field bits to figure out
// if a field was compressed, hence we enforce ordinary (non-raw) stored field merges
// for <3.0 indexes.
return format >= FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS;
}
final Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
seekIndex(n);
long position = indexStream.readLong();
fieldsStream.seek(position);
Document doc = new Document();
int numFields = fieldsStream.readVInt();
// LOG.info("doc seek"+position+","+n+","+numFields);
out: for (int i = 0; i < numFields; i++) {
int fieldNumber = fieldsStream.readVInt();
FieldInfo fi = fieldInfos.fieldInfo(fieldNumber);
FieldSelectorResult acceptField = fieldSelector == null ? FieldSelectorResult.LOAD : fieldSelector.accept(fi.name);
int bits = fieldsStream.readByte() & 0xFF;
assert bits <= (FieldsWriter.FIELD_IS_NUMERIC_MASK | FieldsWriter.FIELD_IS_COMPRESSED | FieldsWriter.FIELD_IS_TOKENIZED | FieldsWriter.FIELD_IS_BINARY): "bits=" + Integer.toHexString(bits);
boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
assert (compressed ? (format < FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS) : true)
: "compressed fields are only allowed in indexes of version <= 2.9";
if(compressed)
{
LOG.info("compressed"+position+","+n+","+fieldNumber+"@"+i+"@"+fi.name+",bits="+bits);
}
boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0;
final int numeric = bits & FieldsWriter.FIELD_IS_NUMERIC_MASK;
switch (acceptField) {
case LOAD:
{
addField(doc, fi, binary, compressed, tokenize, numeric);
break;
}
case LOAD_AND_BREAK:
{
addField(doc, fi, binary, compressed, tokenize, numeric);
break out; //Get out of this loop
}
case LAZY_LOAD:
{
addFieldLazy(doc, fi, binary, compressed, tokenize, true, numeric);
break;
}
case LATENT:
{
addFieldLazy(doc, fi, binary, compressed, tokenize, false, numeric);
break;
}
case SIZE:
{
skipFieldBytes(binary, compressed, addFieldSize(doc, fi, binary, compressed, numeric));
break;
}
case SIZE_AND_BREAK:
{
addFieldSize(doc, fi, binary, compressed, numeric);
break out; //Get out of this loop
}
default:
{
skipField(binary, compressed, numeric);
}
}
}
return doc;
}
/** Fills posStartList and posEndList with the start and end file positions
* of each raw document in a contiguous range of length numDocs starting
* with startDocID (an end position of -1 marks the last stored document in
* the file). Returns the IndexInput (the fieldsStream),
* already seeked to the starting point for startDocID.*/
final IndexInput rawDocs(long[] posStartList,long[] posEndList, int startDocID, int numDocs) throws IOException {
seekIndex(startDocID);
long startOffset = indexStream.readLong();
long posStart = startOffset;
int count = 0;
while (count < numDocs) {
final long posEnd;
final int docID = docStoreOffset + startDocID + count + 1;
assert docID <= numTotalDocs;
if (docID < numTotalDocs)
{
posEnd = indexStream.readLong();
}
else
{
posEnd = -1;
}
posStartList[count] = posStart;
posEndList[count] = posEnd;
count++;
posStart = posEnd;
}
fieldsStream.seek(startOffset);
return fieldsStream;
}
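// A minimal sketch of how a caller might consume rawDocs (array names are
// hypothetical; the caller allocates them with numDocs entries):
//
//   long[] posStart = new long[numDocs];
//   long[] posEnd = new long[numDocs];
//   IndexInput fdt = rawDocs(posStart, posEnd, startDocID, numDocs);
//   // fdt is positioned at posStart[0]; posEnd[i] == -1 marks the last
//   // stored document in the file.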
/**
* Skip the field. We still have to read some of the information about the field, but can skip past the actual content.
* This will have the most payoff on large fields.
*/
private void skipField(boolean binary, boolean compressed, int numeric) throws IOException {
final int numBytes;
switch(numeric) {
case 0:
numBytes = fieldsStream.readVInt();
break;
case FieldsWriter.FIELD_IS_NUMERIC_INT:
fieldsStream.readVVInt();
numBytes=0;
break;
case FieldsWriter.FIELD_IS_NUMERIC_FLOAT:
fieldsStream.readVVVInt();
numBytes=0;
break;
case FieldsWriter.FIELD_IS_NUMERIC_LONG:
fieldsStream.readVVLong();
numBytes=0;
break;
case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE:
fieldsStream.readVVVLong();
numBytes=0;
break;
default:
throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric));
}
skipFieldBytes(binary, compressed, numBytes);
}
private void skipFieldBytes(boolean binary, boolean compressed, int toRead)
throws IOException {
if (toRead <= 0) {
return;
}
if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES
|| binary || compressed) {
byte[] skip = new byte[toRead];
fieldsStream.readBytes(skip, 0, toRead);
} else {
// We need to skip chars. This will slow us down, but still better
fieldsStream.skipChars(toRead);
}
}
private NumericField loadNumericField(FieldInfo fi, int numeric) throws IOException {
assert numeric != 0;
switch(numeric) {
case FieldsWriter.FIELD_IS_NUMERIC_INT:
return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setIntValue(fieldsStream.readVVInt());
case FieldsWriter.FIELD_IS_NUMERIC_LONG:
return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setLongValue(fieldsStream.readVVLong());
case FieldsWriter.FIELD_IS_NUMERIC_FLOAT:
return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setFloatValue(Float.intBitsToFloat(fieldsStream.readVVVInt()));
case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE:
return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setDoubleValue(Double.longBitsToDouble(fieldsStream.readVVVLong()));
default:
throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric));
}
}
private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize, boolean cacheResult, int numeric) throws IOException {
final AbstractField f;
if (binary) {
int toRead = fieldsStream.readVInt();
long pointer = fieldsStream.getFilePointer();
f = new LazyField(fi.name, Field.Store.YES, toRead, pointer, binary, compressed, cacheResult);
//Need to move the pointer ahead by toRead positions
// fieldsStream.seek(pointer + toRead);
fieldsStream.seek(pointer);
byte[] skip=new byte[toRead];
fieldsStream.readBytes(skip,0,toRead);
} else if (numeric != 0) {
f = loadNumericField(fi, numeric);
} else {
Field.Store store = Field.Store.YES;
Field.Index index = Field.Index.toIndex(fi.isIndexed, tokenize);
Field.TermVector termVector = Field.TermVector.toTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector);
if (compressed) {
int toRead = fieldsStream.readVInt();
long pointer = fieldsStream.getFilePointer();
f = new LazyField(fi.name, store, toRead, pointer, binary, compressed, cacheResult);
//skip over the part that we aren't loading
// fieldsStream.seek(pointer + toRead);
fieldsStream.seek(pointer);
byte[] skip=new byte[toRead];
fieldsStream.readBytes(skip,0,toRead);
} else {
int length = fieldsStream.readVInt();
long pointer = fieldsStream.getFilePointer();
//Skip ahead of where we are by the length of what is stored
if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) {
// fieldsStream.seek(pointer+length);
fieldsStream.seek(pointer);
byte[] skip=new byte[length];
fieldsStream.readBytes(skip,0,length);
} else {
fieldsStream.skipChars(length);
}
f = new LazyField(fi.name, store, index, termVector, length, pointer, binary, compressed, cacheResult);
}
}
f.setOmitNorms(fi.omitNorms);
f.setIndexOptions(fi.indexOptions);
doc.add(f);
}
private void addField(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize, int numeric) throws CorruptIndexException, IOException {
final AbstractField f;
//we have a binary stored field, and it may be compressed
if (binary) {
int toRead = fieldsStream.readVInt();
final byte[] b = new byte[toRead];
fieldsStream.readBytes(b, 0, b.length);
if (compressed) {
f = new Field(fi.name, uncompress(b));
} else {
f = new Field(fi.name, b);
}
} else if (numeric != 0) {
f = loadNumericField(fi, numeric);
} else {
Field.Store store = Field.Store.YES;
Field.Index index = Field.Index.toIndex(fi.isIndexed, tokenize);
Field.TermVector termVector = Field.TermVector.toTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector);
if (compressed) {
int toRead = fieldsStream.readVInt();
final byte[] b = new byte[toRead];
fieldsStream.readBytes(b, 0, b.length);
f = new Field(fi.name, // field name
false,
new String(uncompress(b), "UTF-8"), // uncompress the value and add as string
store,
index,
termVector);
} else {
f = new Field(fi.name, // name
false,
fieldsStream.readString(), // read value
store,
index,
termVector);
}
}
f.setIndexOptions(fi.indexOptions);
f.setOmitNorms(fi.omitNorms);
doc.add(f);
}
// Add the size of field as a byte[] containing the 4 bytes of the integer byte size (high order byte first; char = 2 bytes)
// Read just the size -- caller must skip the field content to continue reading fields
// Return the size in bytes or chars, depending on field type
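// A sketch of how a consumer of such a size field might decode the stored
// bytes (big-endian, exactly as written below):
//   int byteSize = ((sizebytes[0] & 0xFF) << 24) | ((sizebytes[1] & 0xFF) << 16)
//                | ((sizebytes[2] & 0xFF) << 8)  |  (sizebytes[3] & 0xFF);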
private int addFieldSize(Document doc, FieldInfo fi, boolean binary, boolean compressed, int numeric) throws IOException {
final int bytesize, size;
switch(numeric) {
case 0:
size = fieldsStream.readVInt();
bytesize = (binary || compressed) ? size : 2*size;
break;
case FieldsWriter.FIELD_IS_NUMERIC_INT:
fieldsStream.readVVInt();
size=0;
bytesize=4;
break;
case FieldsWriter.FIELD_IS_NUMERIC_FLOAT:
fieldsStream.readVVVInt();
size=0;
bytesize=4;
break;
case FieldsWriter.FIELD_IS_NUMERIC_LONG:
fieldsStream.readVVLong();
size=0;
bytesize=8;
break;
case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE:
fieldsStream.readVVVLong();
size=0;
bytesize=8;
break;
default:
throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric));
}
byte[] sizebytes = new byte[4];
sizebytes[0] = (byte) (bytesize>>>24);
sizebytes[1] = (byte) (bytesize>>>16);
sizebytes[2] = (byte) (bytesize>>> 8);
sizebytes[3] = (byte) bytesize;
doc.add(new Field(fi.name, sizebytes));
return size;
}
/**
* A Lazy implementation of Fieldable that defers loading of fields until asked for, instead of when the Document is
* loaded.
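* <p>
* A minimal sketch (assuming a FieldSelector that answers LAZY_LOAD for the
* field in question; the field name is hypothetical):
* <pre>
*   Document d = doc(docId, lazySelector);  // field content is not read yet
*   Fieldable f = d.getFieldable("body");
*   String text = f.stringValue();          // the value is read from disk here
* </pre>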
*/
private class LazyField extends AbstractField implements Fieldable {
private int toRead;
private long pointer;
/** @deprecated Only kept for backward-compatibility with &lt;3.0 indexes. Will be removed in 4.0. */
@Deprecated
private boolean isCompressed;
private boolean cacheResult;
public LazyField(String name, Field.Store store, int toRead, long pointer, boolean isBinary, boolean isCompressed, boolean cacheResult) {
super(name, store, Field.Index.NO, Field.TermVector.NO);
this.toRead = toRead;
this.pointer = pointer;
this.isBinary = isBinary;
this.cacheResult = cacheResult;
if (isBinary)
binaryLength = toRead;
lazy = true;
this.isCompressed = isCompressed;
}
public LazyField(String name, Field.Store store, Field.Index index, Field.TermVector termVector, int toRead, long pointer, boolean isBinary, boolean isCompressed, boolean cacheResult) {
super(name, store, index, termVector);
this.toRead = toRead;
this.pointer = pointer;
this.isBinary = isBinary;
this.cacheResult = cacheResult;
if (isBinary)
binaryLength = toRead;
lazy = true;
this.isCompressed = isCompressed;
}
private IndexInput getFieldStream() {
IndexInput localFieldsStream = fieldsStreamTL.get();
if (localFieldsStream == null) {
localFieldsStream = (IndexInput) cloneableFieldsStream.clone();
fieldsStreamTL.set(localFieldsStream);
}
return localFieldsStream;
}
/** The value of the field as a Reader, or null. If null, the String value,
* binary value, or TokenStream value is used. Exactly one of stringValue(),
* readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
public Reader readerValue() {
ensureOpen();
return null;
}
/** The value of the field as a TokenStream, or null. If null, the Reader value,
* String value, or binary value is used. Exactly one of stringValue(),
* readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
public TokenStream tokenStreamValue() {
ensureOpen();
return null;
}
/** The value of the field as a String, or null. If null, the Reader value,
* binary value, or TokenStream value is used. Exactly one of stringValue(),
* readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
public String stringValue() {
ensureOpen();
if (isBinary)
return null;
else {
if (fieldsData == null) {
IndexInput localFieldsStream = getFieldStream();
String value;
try {
localFieldsStream.seek(pointer);
if (isCompressed) {
final byte[] b = new byte[toRead];
localFieldsStream.readBytes(b, 0, b.length);
value = new String(uncompress(b), "UTF-8");
} else {
if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) {
byte[] bytes = new byte[toRead];
localFieldsStream.readBytes(bytes, 0, toRead);
value = new String(bytes, "UTF-8");
} else {
//read in chars b/c we already know the length we need to read
char[] chars = new char[toRead];
localFieldsStream.readChars(chars, 0, toRead);
value = new String(chars);
}
}
} catch (IOException e) {
throw new FieldReaderException(e);
}
if (cacheResult){
fieldsData = value;
}
return value;
} else{
return (String) fieldsData;
}
}
}
// public long getPointer() {
// ensureOpen();
// return pointer;
// }
//
// public void setPointer(long pointer) {
// ensureOpen();
// this.pointer = pointer;
// }
//
// public int getToRead() {
// ensureOpen();
// return toRead;
// }
//
// public void setToRead(int toRead) {
// ensureOpen();
// this.toRead = toRead;
// }
@Override
public byte[] getBinaryValue(byte[] result) {
ensureOpen();
if (isBinary) {
if (fieldsData == null) {
// Allocate new buffer if result is null or too small
final byte[] b;
byte[] value;
if (result == null || result.length < toRead)
b = new byte[toRead];
else
b = result;
IndexInput localFieldsStream = getFieldStream();
// Throw this IOException since IndexReader.document does so anyway, so probably not that big of a change for people
// since they are already handling this exception when getting the document
try {
localFieldsStream.seek(pointer);
localFieldsStream.readBytes(b, 0, toRead);
if (isCompressed) {
value = uncompress(b);
} else {
value = b;
}
} catch (IOException e) {
throw new FieldReaderException(e);
}
binaryOffset = 0;
binaryLength = toRead;
if (cacheResult) {
fieldsData = value;
}
return value;
} else{
return (byte[]) fieldsData;
}
} else {
return null;
}
}
}
private byte[] uncompress(byte[] b)
throws CorruptIndexException {
try {
return CompressionTools.decompress(b);
} catch (DataFormatException e) {
// this will happen if the field is not compressed
CorruptIndexException newException = new CorruptIndexException("field data are in wrong format: " + e.toString());
newException.initCause(e);
throw newException;
}
}
}