package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.store.DataOutput.ByteIndexInput;
import org.apache.lucene.util.UnicodeUtil;
import java.io.IOException;
import java.util.Collection;
import java.util.Map;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.util.BitVector;
import org.apache.lucene.util.CollectionUtil;
/**
 * Gathers the postings (term -> doc/freq/position data) buffered in RAM by all
 * indexing thread states and writes them into the segment being flushed, via
 * the {@link FormatPostingsFieldsConsumer} writer chain.  Fields with the same
 * name across threads are merged together before being written.
 */
final class FreqProxTermsWriter extends TermsHashConsumer {

  @Override
  public TermsHashConsumerPerThread addThread(TermsHashPerThread perThread) {
    return new FreqProxTermsWriterPerThread(perThread);
  }

  /**
   * Compares two in-RAM terms stored as {@code char[]} slices starting at the
   * given offsets.  Terms are terminated by the sentinel char value 0xffff, so
   * a term that is a prefix of another compares as smaller (its sentinel hits
   * one of the explicit 0xffff checks below).  Returns a negative value, zero,
   * or a positive value, like a standard comparator.
   */
  private static int compareText(final char[] text1, int pos1, final char[] text2, int pos2) {
    while(true) {
      final char c1 = text1[pos1++];
      final char c2 = text2[pos2++];
      if (c1 != c2) {
        // 0xffff marks end-of-term: an ended term sorts before any
        // term that still has characters remaining.
        if (0xffff == c2)
          return 1;
        else if (0xffff == c1)
          return -1;
        else
          return c1-c2;
      } else if (0xffff == c1)
        // Both terms ended at the same position: they are equal.
        return 0;
    }
  }

  @Override
  void abort() {}

  // TODO: would be nice to factor out more of this, eg the
  // FreqProxFieldMergeState, and code to visit all Fields
  // under the same FieldInfo together, up into TermsHash*.
  // Other writers would presumably share a lot of this...

  /**
   * Flushes all buffered postings to the segment.  Collects every per-field
   * state (across all thread states) that actually holds postings, sorts them
   * so fields with the same name are adjacent, then writes each field group
   * via {@link #appendPostings} and resets the per-field/per-thread state for
   * reuse by the next segment.
   */
  @Override
  public void flush(Map<TermsHashConsumerPerThread,Collection<TermsHashConsumerPerField>> threadsAndFields, final SegmentWriteState state) throws IOException {

    // Gather all FieldData's that have postings, across all
    // ThreadStates
    List<FreqProxTermsWriterPerField> allFields = new ArrayList<FreqProxTermsWriterPerField>();

    for (Map.Entry<TermsHashConsumerPerThread,Collection<TermsHashConsumerPerField>> entry : threadsAndFields.entrySet()) {

      Collection<TermsHashConsumerPerField> fields = entry.getValue();

      for (final TermsHashConsumerPerField i : fields) {
        final FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField) i;
        // Skip fields that buffered no postings; they contribute nothing
        // to this segment.
        if (perField.termsHashPerField.numPostings > 0)
          allFields.add(perField);
      }
    }

    // Sort by field name
    CollectionUtil.quickSort(allFields);
    final int numAllFields = allFields.size();

    // TODO: allow Lucene user to customize this consumer:
    // NOTE: fieldInfos is inherited from the base class; it is not declared
    // in this file.
    final FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos);

    /*
    Current writer chain:
      FormatPostingsFieldsConsumer
        -> IMPL: FormatPostingsFieldsWriter
          -> FormatPostingsTermsConsumer
            -> IMPL: FormatPostingsTermsWriter
              -> FormatPostingsDocConsumer
                -> IMPL: FormatPostingsDocWriter
                  -> FormatPostingsPositionsConsumer
                    -> IMPL: FormatPostingsPositionsWriter
    */

    try {
      int start = 0;
      while(start < numAllFields) {
        // Because allFields is sorted, all per-thread instances of the same
        // field name are contiguous: [start, end) is one field group.
        final FieldInfo fieldInfo = allFields.get(start).fieldInfo;
        final String fieldName = fieldInfo.name;

        int end = start+1;
        while(end < numAllFields && allFields.get(end).fieldInfo.name.equals(fieldName))
          end++;

        FreqProxTermsWriterPerField[] fields = new FreqProxTermsWriterPerField[end-start];
        for(int i=start;i<end;i++) {
          fields[i-start] = allFields.get(i);

          // Aggregate the storePayload as seen by the same
          // field across multiple threads
          if (fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
            fieldInfo.storePayloads |= fields[i-start].hasPayloads;
          }
        }

        // If this field has postings then add them to the
        // segment
        appendPostings(fieldName, state, fields, consumer);

        // Free/recycle the per-field postings state now that it has been
        // written out.
        for(int i=0;i<fields.length;i++) {
          TermsHashPerField perField = fields[i].termsHashPerField;
          int numPostings = perField.numPostings;
          perField.reset();
          perField.shrinkHash(numPostings);
          fields[i].reset();
        }

        start = end;
      }

      for (Map.Entry<TermsHashConsumerPerThread,Collection<TermsHashConsumerPerField>> entry : threadsAndFields.entrySet()) {
        FreqProxTermsWriterPerThread perThread = (FreqProxTermsWriterPerThread) entry.getKey();
        perThread.termsHashPerThread.reset(true);
      }
    } finally {
      // Always close the writer chain, even if appendPostings threw.
      consumer.finish();
    }
  }

  // Reused scratch buffer for reading payload bytes out of the prox stream;
  // grown on demand, never shrunk.
  private byte[] payloadBuffer;

  /* Walk through all unique text tokens (Posting
   * instances) found in this field and serialize them
   * into a single RAM segment. */
  void appendPostings(String fieldName, SegmentWriteState state,
                      FreqProxTermsWriterPerField[] fields,
                      FormatPostingsFieldsConsumer consumer)
    throws CorruptIndexException, IOException {

    // One merge state per thread-local instance of this field; together they
    // are merge-sorted by term, then by docID within each term.
    int numFields = fields.length;

    final FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields];

    for(int i=0;i<numFields;i++) {
      FreqProxFieldMergeState fms = mergeStates[i] = new FreqProxFieldMergeState(fields[i]);

      assert fms.field.fieldInfo == fields[0].fieldInfo;

      // Should always be true
      boolean result = fms.nextTerm();
      assert result;
    }

    final FormatPostingsTermsConsumer termsConsumer = consumer.addField(fields[0].fieldInfo);
    final Term protoTerm = new Term(fieldName);

    FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];

    final IndexOptions currentFieldIndexOptions = fields[0].fieldInfo.indexOptions;

    // Buffered delete-by-term operations for this segment, if any; maps a
    // term to the docID limit up to which matching docs must be deleted.
    final Map<Term,Integer> segDeletes;
    if (state.segDeletes != null && state.segDeletes.terms.size() > 0) {
      segDeletes = state.segDeletes.terms;
    } else {
      segDeletes = null;
    }

    try {
      // TODO: really TermsHashPerField should take over most
      // of this loop, including merge sort of terms from
      // multiple threads and interacting with the
      // TermsConsumer, only calling out to us (passing us the
      // DocsConsumer) to handle delivery of docs/positions
      while(numFields > 0) {

        // Get the next term to merge: linear scan over all live merge
        // states; termStates[0..numToMerge) collects every state currently
        // positioned on the smallest term.
        termStates[0] = mergeStates[0];
        int numToMerge = 1;

        // TODO: pqueue
        for(int i=1;i<numFields;i++) {
          final char[] text = mergeStates[i].text;
          final int textOffset = mergeStates[i].textOffset;
          final int cmp = compareText(text, textOffset, termStates[0].text, termStates[0].textOffset);

          if (cmp < 0) {
            termStates[0] = mergeStates[i];
            numToMerge = 1;
          } else if (cmp == 0)
            termStates[numToMerge++] = mergeStates[i];
        }

        Term term=protoTerm.createTerm(termStates[0].termText());
        final FormatPostingsDocsConsumer docConsumer = termsConsumer.addTerm(term,termStates[0].text, termStates[0].textOffset);

        // If a delete-by-term was buffered for this term, docs with
        // docID < delDocLimit existed before the delete and must be
        // marked deleted while we write them.
        final int delDocLimit;
        if (segDeletes != null) {
          final Integer docIDUpto = segDeletes.get(term);
          if (docIDUpto != null) {
            delDocLimit = docIDUpto;
          } else {
            delDocLimit = 0;
          }
        } else {
          delDocLimit = 0;
        }

        docConsumer.reset();
        docConsumer.startTerm();

        // Now termStates has numToMerge FieldMergeStates
        // which all share the same term.  Now we must
        // interleave the docID streams.
        while (numToMerge > 0) {

          // Pick the state with the smallest current docID so docs are
          // written in increasing docID order.
          FreqProxFieldMergeState minState = termStates[0];
          for (int i = 1; i < numToMerge; i++)
            if (termStates[i].docID < minState.docID)
              minState = termStates[i];

          final int termDocFreq = minState.termFreq;

          final FormatPostingsPositionsConsumer posConsumer = docConsumer
              .addDoc(minState.docID, termDocFreq);

          if (minState.docID < delDocLimit) {
            // Mark it deleted.  TODO: we could also skip
            // writing its postings; this would be
            // deterministic (just for this Term's docs).
            if (state.deletedDocs == null) {
              state.deletedDocs = new BitVector(state.numDocs);
            }
            state.deletedDocs.set(minState.docID);
          }

          final ByteSliceReader prox = minState.prox;

          // Positions are only buffered (and only written) when the field
          // indexes positions.
          if (currentFieldIndexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
            try {
              int position = 0;
              for (int j = 0; j < termDocFreq; j++) {
                // Each position is delta-coded as a VInt whose low bit
                // flags whether a payload follows.
                final int code = prox.readVInt();
                position += code >> 1;

                final int payloadLength;
                if ((code & 1) != 0) {
                  // This position has a payload
                  payloadLength = prox.readVInt();

                  if (payloadBuffer == null
                      || payloadBuffer.length < payloadLength)
                    payloadBuffer = new byte[payloadLength];

                  prox.readBytes(payloadBuffer, 0,
                      payloadLength);

                } else
                  payloadLength = 0;

                posConsumer.addPosition(position,
                    payloadBuffer, 0, payloadLength);
              } // End for

            } finally {
              posConsumer.finish();
            }
          }

          if (!minState.nextDoc()) {

            // Remove from termStates: this state has no more docs for
            // the current term (compact the array in place).
            int upto = 0;
            for (int i = 0; i < numToMerge; i++)
              if (termStates[i] != minState)
                termStates[upto++] = termStates[i];
            numToMerge--;
            assert upto == numToMerge;

            // Advance this state to its next term, if any
            if (!minState.nextTerm()) {
              // OK, no more terms in this field state; remove it from
              // mergeStates entirely (compact in place).
              upto = 0;
              for (int i = 0; i < numFields; i++)
                if (mergeStates[i] != minState)
                  mergeStates[upto++] = mergeStates[i];
              numFields--;
              assert upto == numFields;
            }
          }
        }

        docConsumer.finish();
      }
    } finally {
      // Always finish the terms writer, even on exception.
      termsConsumer.finish();
    }
  }

  // Scratch holder for UTF-8 encoded term bytes.  NOTE(review): appears
  // unused within this file — presumably accessed by a collaborator; verify
  // before removing.
  final UnicodeUtil.UTF8Result termsUTF8 = new UnicodeUtil.UTF8Result();
}