/**
* Copyright T Jake Luciani
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package lucandra;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import lucandra.serializers.thrift.DocumentMetadata;
import lucandra.serializers.thrift.ThriftTerm;
import org.apache.cassandra.db.ReadCommand;
import org.apache.cassandra.db.Row;
import org.apache.cassandra.db.SliceByNamesReadCommand;
import org.apache.cassandra.thrift.ColumnParent;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermVectorOffsetInfo;
public class TermFreqVector implements org.apache.lucene.index.TermFreqVector,
org.apache.lucene.index.TermPositionVector
{
private String field;
private byte[] docId;
private String[] terms;
private int[] freqVec;
private int[][] termPositions;
private TermVectorOffsetInfo[][] termOffsets;
public TermFreqVector(String indexName, String field, int docI)
{
this.field = field;
byte[] indexNameBytes = null;
try
{
this.docId = Integer.toHexString(docI).getBytes("UTF-8");
indexNameBytes = indexName.getBytes("UTF-8");
}
catch (UnsupportedEncodingException e1)
{
throw new RuntimeException(e1);
}
ByteBuffer key = CassandraUtils.hashKeyBytes(indexNameBytes, CassandraUtils.delimeterBytes, docId);
ReadCommand rc = new SliceByNamesReadCommand(CassandraUtils.keySpace, key, CassandraUtils.metaColumnPath,
Arrays.asList(CassandraUtils.documentMetaFieldBytes));
List<Row> rows = null;
try
{
rows = CassandraUtils.robustRead(CassandraUtils.consistency, rc);
if (rows.isEmpty())
{
return; // this docId is missing
}
DocumentMetadata allTerms = IndexWriter.fromBytesUsingThrift(rows.get(0).cf.getColumn(
CassandraUtils.documentMetaFieldBytes).value());
List<ReadCommand> readCommands = new ArrayList<ReadCommand>();
for (ThriftTerm t : allTerms.getTerms())
{
// skip the ones not of this field
if (!t.getField().equals(field))
continue;
// add to multiget params
try
{
key = CassandraUtils.hashKeyBytes(indexName.getBytes("UTF-8"), CassandraUtils.delimeterBytes, t.getField()
.getBytes("UTF-8"), CassandraUtils.delimeterBytes, t.getText());
}
catch (UnsupportedEncodingException e)
{
throw new RuntimeException("JVM doesn't support UTF-8", e);
}
readCommands.add(new SliceByNamesReadCommand(CassandraUtils.keySpace, key, new ColumnParent()
.setColumn_family(CassandraUtils.termVecColumnFamily), Arrays.asList(ByteBuffer
.wrap(CassandraUtils.writeVInt(docI)))));
}
rows = CassandraUtils.robustRead(CassandraUtils.consistency, readCommands.toArray(new ReadCommand[] {}));
}
catch (IOException e)
{
throw new RuntimeException(e);
}
terms = new String[rows.size()];
freqVec = new int[rows.size()];
termPositions = new int[rows.size()][];
termOffsets = new TermVectorOffsetInfo[rows.size()][];
int i = 0;
try
{
for (Row row : rows)
{
String rowKey = ByteBufferUtil.string(row.key.key, CassandraUtils.UTF_8);
String termStr = rowKey.substring(rowKey.indexOf(CassandraUtils.delimeter)
+ CassandraUtils.delimeter.length());
Term t = CassandraUtils.parseTerm(termStr);
terms[i] = t.text();
// Find the offsets and positions
LucandraTermInfo termInfo = null;
if (row.cf != null)
{
ByteBuffer val = row.cf.getSortedColumns().iterator().next().value();
termInfo = new LucandraTermInfo(0, val);
termPositions[i] = termInfo.positions;
}
freqVec[i] = termInfo.freq;
if (termInfo == null || !termInfo.hasOffsets)
{
termOffsets[i] = null;
}
else
{
int[] offsets = termInfo.offsets;
termOffsets[i] = new TermVectorOffsetInfo[freqVec[i]];
for (int j = 0, k = 0; j < offsets.length; j += 2, k++)
{
termOffsets[i][k] = new TermVectorOffsetInfo(offsets[j], offsets[j + 1]);
}
}
i++;
}
}
catch (CharacterCodingException e)
{
throw new RuntimeException(e);
}
}
public String getField()
{
return field;
}
public int[] getTermFrequencies()
{
return freqVec;
}
public String[] getTerms()
{
return terms;
}
public int indexOf(String term)
{
return Arrays.binarySearch(terms, term);
}
public int[] indexesOf(String[] terms, int start, int len)
{
int[] res = new int[terms.length];
for (int i = 0; i < terms.length; i++)
{
res[i] = indexOf(terms[i]);
}
return res;
}
public int size()
{
return terms.length;
}
public TermVectorOffsetInfo[] getOffsets(int index)
{
return termOffsets[index];
}
public int[] getTermPositions(int index)
{
return termPositions[index];
}
}