Package net.bpiwowar.mg4j.extensions.conf

Source Code of net.bpiwowar.mg4j.extensions.conf.IndexConfiguration

/**
*
*/
package net.bpiwowar.mg4j.extensions.conf;

import bpiwowar.argparser.Argument;
import bpiwowar.argparser.checkers.IOChecker.ValidDirectory;
import it.unimi.di.big.mg4j.index.DiskBasedIndex;
import it.unimi.di.big.mg4j.index.Index;
import it.unimi.dsi.fastutil.ints.IntBigList;
import it.unimi.dsi.fastutil.longs.LongBigArrayBigList;
import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.fastutil.objects.ObjectBigList;
import it.unimi.dsi.io.InputBitStream;
import org.apache.log4j.Logger;

import java.io.File;
import java.io.IOException;
import java.lang.ref.SoftReference;

final public class IndexConfiguration {
  final static private Logger LOGGER = Logger.getLogger(IndexConfiguration.class);

  @Argument(name = "dir", help = "Index directory", checkers = ValidDirectory.class, required = true)
  public File directory;
  @Argument(name = "basename", help = "Index basename")
  public String basename = "index";
  @Argument(name = "field", help = "Field to use (by default \"text\")")
  public String field = "text";

  transient public Index index;

    private long unknownTermId;

    public IndexConfiguration() {
  }

    /** Returns the ID of an unknown term */
    public long getUnknownTermId() {
        checkTermMap(false);
        return unknownTermId;
    }

    /**
   *
   * @param directory
   *            index directory
   * @param basename
   *            index basename
   * @param field
   *            index field (e.g., "text")
   * @throws Exception
   */
  public IndexConfiguration(File directory, String basename, String field)
      throws Exception {
    this.directory = directory;
    this.basename = basename;
    this.field = field;
    init();
  }

  /**
   * Initialise the index
   */
  public Index init() throws Exception {
        if (index != null)
            return index;

    return index = Index.getInstance(
        new File(directory, String.format("%s-%s", basename, field))
            .toString(), true, true);
  }

  it.unimi.dsi.big.util.StringMap<? extends CharSequence> termMap;
  private ObjectBigList<? extends CharSequence> list;

  /**
   * Return the total length of the documents
   *
   * @return
   */
  public long getNumberOfPostings() {
        return index.numberOfPostings;
  }

  /**
   * Return the size of a document
   */
  public double getSize(int docId) {
    if (index.sizes == null)
      return -1;

    return index.sizes.get(docId);
  }

  /**
   * Get a term id for a given word
   *
   * @param word
   * @return
   */
  public long getTermId(CharSequence word) {
    checkTermMap(false);

    return termMap.getLong(word);
  }



  private void checkTermMap(boolean getList) {
    if (termMap == null) {
            termMap = index.termMap;
            unknownTermId = termMap.defaultReturnValue();
    }

    if (getList && list == null)
      list = termMap.list();
  }

  public ObjectBigList<? extends CharSequence> getTerms() {
    checkTermMap(true);
    return list;
  }

  /**
   * Get term
   * @param i
   * @return
   */
  public CharSequence getTerm(long i) {
    checkTermMap(true);
    return list.get(i);
  }

  /** Weak reference to document frequencies */
  SoftReference<IntBigList> frequencies = new SoftReference<IntBigList>(null);

  /**
   * Get document frequencies (i.e., number of documents in which a term
   * appears)
   *
   * @return
   * @throws java.io.IOException
   */
  public IntBigList getFrequencies() throws IOException {
    IntBigList list = frequencies.get();
    if (list == null) {
      File frequenciesFile = new File(directory, String.format("%s-%s%s",
          basename, field, DiskBasedIndex.FREQUENCIES_EXTENSION));
      LOGGER.info("Loading term frequencies from file "+
          frequenciesFile);
      list = DiskBasedIndex.readSizes(frequenciesFile.toString(),
          index.numberOfTerms);
      frequencies = new SoftReference<>(list);
    }
    return list;

  }

  /** Weak reference to document frequencies */
  SoftReference<LongBigList> termfrequencies = new SoftReference<>(null);
 
  /**
   * Get term frequencies (i.e. the number of times a term occurs in the
   * whole index).
   *
   * @return the term frequencies as an array which is parallel to the term
   *     ids
   * @throws java.io.IOException
   */
  public LongBigList getTermFrequency() throws IOException {
        LongBigList list = termfrequencies.get();
   
    if (list == null) {
      File frequenciesFile = new File(directory, String.format("%s-%s%s",
          basename, field, DiskBasedIndex.COUNTS_EXTENSION));
      LOGGER.info("Loading term frequencies from file " +
          frequenciesFile);
      list = new LongBigArrayBigList(index.numberOfTerms);

      final InputBitStream in = new InputBitStream(frequenciesFile);
      for (long i = 0; i < list.size64(); i++)
        list.set(i, in.readLongGamma());
      in.close();
      termfrequencies = new SoftReference<>(list);
     
      LOGGER.info("Completed.");
    }
    return list;

  }
}
TOP

Related Classes of net.bpiwowar.mg4j.extensions.conf.IndexConfiguration

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.