Package lucandra

Source Code of lucandra.TermFreqVector

/**
* Copyright T Jake Luciani
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package lucandra;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import lucandra.serializers.thrift.DocumentMetadata;
import lucandra.serializers.thrift.ThriftTerm;

import org.apache.cassandra.db.ReadCommand;
import org.apache.cassandra.db.Row;
import org.apache.cassandra.db.SliceByNamesReadCommand;
import org.apache.cassandra.thrift.ColumnParent;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermVectorOffsetInfo;

public class TermFreqVector implements org.apache.lucene.index.TermFreqVector,
        org.apache.lucene.index.TermPositionVector
{

    private String                   field;
    private byte[]                   docId;
    private String[]                 terms;
    private int[]                    freqVec;
    private int[][]                  termPositions;
    private TermVectorOffsetInfo[][] termOffsets;

    public TermFreqVector(String indexName, String field, int docI)
    {
        this.field = field;
       
        byte[] indexNameBytes = null;
        try
        {
            this.docId = Integer.toHexString(docI).getBytes("UTF-8");
            indexNameBytes = indexName.getBytes("UTF-8");
        }
        catch (UnsupportedEncodingException e1)
        {
           throw new RuntimeException(e1);
        }

        ByteBuffer key = CassandraUtils.hashKeyBytes(indexNameBytes, CassandraUtils.delimeterBytes, docId);

        ReadCommand rc = new SliceByNamesReadCommand(CassandraUtils.keySpace, key, CassandraUtils.metaColumnPath,
                Arrays.asList(CassandraUtils.documentMetaFieldBytes));


        List<Row> rows = null;
        try
        {

            rows = CassandraUtils.robustRead(CassandraUtils.consistency, rc);


            if (rows.isEmpty())
            {

                return; // this docId is missing
            }

          

            DocumentMetadata allTerms = IndexWriter.fromBytesUsingThrift(rows.get(0).cf.getColumn(
                    CassandraUtils.documentMetaFieldBytes).value());

            List<ReadCommand> readCommands = new ArrayList<ReadCommand>();

            for (ThriftTerm t : allTerms.getTerms())
            {
                // skip the ones not of this field
                if (!t.getField().equals(field))
                    continue;

                // add to multiget params
                try
                {
                    key = CassandraUtils.hashKeyBytes(indexName.getBytes("UTF-8"), CassandraUtils.delimeterBytes, t.getField()
                            .getBytes("UTF-8"), CassandraUtils.delimeterBytes, t.getText());
                }
                catch (UnsupportedEncodingException e)
                {
                    throw new RuntimeException("JVM doesn't support UTF-8", e);
                }

                readCommands.add(new SliceByNamesReadCommand(CassandraUtils.keySpace, key, new ColumnParent()
                        .setColumn_family(CassandraUtils.termVecColumnFamily), Arrays.asList(ByteBuffer
                        .wrap(CassandraUtils.writeVInt(docI)))));
            }

            rows = CassandraUtils.robustRead(CassandraUtils.consistency, readCommands.toArray(new ReadCommand[] {}));


        }
        catch (IOException e)
        {
            throw new RuntimeException(e);
        }
       
        terms = new String[rows.size()];
        freqVec = new int[rows.size()];
        termPositions = new int[rows.size()][];
        termOffsets = new TermVectorOffsetInfo[rows.size()][];

        int i = 0;

        try
        {
            for (Row row : rows)
            {
                String rowKey = ByteBufferUtil.string(row.key.key, CassandraUtils.UTF_8);

                String termStr = rowKey.substring(rowKey.indexOf(CassandraUtils.delimeter)
                        + CassandraUtils.delimeter.length());

                Term t = CassandraUtils.parseTerm(termStr);

                terms[i] = t.text();

                // Find the offsets and positions
                LucandraTermInfo termInfo = null;

                if (row.cf != null)
                {
                    ByteBuffer val = row.cf.getSortedColumns().iterator().next().value();
                   
                    termInfo = new LucandraTermInfo(0, val);

                    termPositions[i] = termInfo.positions;
                }
                              
                freqVec[i] = termInfo.freq;

                if (termInfo == null || !termInfo.hasOffsets)
                {
                    termOffsets[i] = null;
                }
                else
                {

                    int[] offsets = termInfo.offsets;

                    termOffsets[i] = new TermVectorOffsetInfo[freqVec[i]];
                    for (int j = 0, k = 0; j < offsets.length; j += 2, k++)
                    {
                        termOffsets[i][k] = new TermVectorOffsetInfo(offsets[j], offsets[j + 1]);
                    }
                }

                i++;
            }
        }
        catch (CharacterCodingException e)
        {
            throw new RuntimeException(e);

        }

    }

    public String getField()
    {
        return field;
    }

    public int[] getTermFrequencies()
    {
        return freqVec;
    }

    public String[] getTerms()
    {
        return terms;
    }

    public int indexOf(String term)
    {
        return Arrays.binarySearch(terms, term);
    }

    public int[] indexesOf(String[] terms, int start, int len)
    {
        int[] res = new int[terms.length];

        for (int i = 0; i < terms.length; i++)
        {
            res[i] = indexOf(terms[i]);
        }

        return res;
    }

    public int size()
    {
        return terms.length;
    }

    public TermVectorOffsetInfo[] getOffsets(int index)
    {
        return termOffsets[index];
    }

    public int[] getTermPositions(int index)
    {
        return termPositions[index];
    }

}
TOP

Related Classes of lucandra.TermFreqVector

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact coftware#gmail.com.