Package org.apache.mahout.utils.vectors.lucene

Source Code of org.apache.mahout.utils.vectors.lucene.LuceneIterable$TDIterator

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.mahout.utils.vectors.lucene;

import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.SetBasedFieldSelector;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermDocs;
import org.apache.mahout.matrix.Vector;

import java.io.IOException;
import java.util.Collections;
import java.util.Iterator;
import java.util.NoSuchElementException;

/**
* A LuceneIterable is an Iterable<Vector> that uses a Lucene index as the source for creating the {@link org.apache.mahout.matrix.Vector}.
* The Field used to create the Vector currently must have Term Vectors stored for it.
*/
public class LuceneIterable implements Iterable<Vector> {

  private IndexReader indexReader;
  private String field;
  private String idField;
  private FieldSelector idFieldSelector;

  private VectorMapper mapper;
  private double normPower = NO_NORMALIZING;

  public static final double NO_NORMALIZING = -1.0;

  public LuceneIterable(IndexReader reader, String idField, String field, VectorMapper mapper) {
    this(reader, idField, field, mapper, NO_NORMALIZING);
  }

  /**
   * Produce a LuceneIterable that can create the Vector plus normalize it.
   *
   * @param reader    The {@link org.apache.lucene.index.IndexReader} to read the documents from.
   * @param idField   - The Field containing the id.  May be null
   * @param field     The field to use for the Vector
   * @param mapper    The {@link org.apache.mahout.utils.vectors.lucene.VectorMapper} for creating {@link org.apache.mahout.matrix.Vector}s from Lucene's TermVectors.
   * @param normPower The normalization value.  Must be greater than or equal to 0 or equal to {@link #NO_NORMALIZING}
   */
  public LuceneIterable(IndexReader reader, String idField, String field, VectorMapper mapper, double normPower) {
    if (normPower != NO_NORMALIZING && normPower < 0) {
      throw new IllegalArgumentException("normPower must either be -1 or >= 0");
    }
    idFieldSelector = new SetBasedFieldSelector(Collections.singleton(idField), Collections.emptySet());
    this.indexReader = reader;
    this.idField = idField;
    this.field = field;
    this.mapper = mapper;
    this.normPower = normPower;
  }


  @Override
  public Iterator<Vector> iterator() {
    try {
      return new TDIterator();
    } catch (IOException e) {
      throw new IllegalStateException(e);
    }
  }

  private class TDIterator implements Iterator<Vector> {
    private final TermDocs termDocs;

    private TDIterator() throws IOException {
      //term docs(null) is a better way of iterating all the docs in Lucene
      this.termDocs = indexReader.termDocs(null);
    }

    @Override
    public boolean hasNext() {
      // TODO this doesn't work with the Iterator contract -- hasNext() cannot have a side effect     
      try {
        return termDocs.next();
      } catch (IOException e) {
        throw new IllegalStateException(e);
      }
    }

    @Override
    public Vector next() {
      Vector result;
      int doc = termDocs.doc();
      //
      try {
        indexReader.getTermFreqVector(doc, field, mapper);
        result = mapper.getVector();
        if (idField != null) {
          String id = indexReader.document(doc, idFieldSelector).get(idField);
          result.setName(id);
        } else {
          result.setName(String.valueOf(doc));
        }
        if (normPower != NO_NORMALIZING) {
          result = result.normalize(normPower);
        }
      } catch (IOException e) {
        //Log?
        throw new IllegalStateException(e);
      }

      return result;
    }


    @Override
    public void remove() {
      throw new UnsupportedOperationException();
    }

  }


}
TOP

Related Classes of org.apache.mahout.utils.vectors.lucene.LuceneIterable$TDIterator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.