Package ivory.bloomir.data

Source Code of ivory.bloomir.data.SignatureIO

package ivory.bloomir.data;

import java.io.EOFException;
import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;

import ivory.bloomir.util.DocumentUtility;
import ivory.core.RetrievalEnvironment;
import ivory.core.data.index.Posting;
import ivory.core.data.index.PostingsList;
import ivory.core.data.index.PostingsReader;
import ivory.core.data.stat.SpamPercentileScore;

public class SignatureIO {
  private static final Logger LOGGER = Logger.getLogger(SignatureIO.class);

  /**
   * Loads an entire collection of signatures into an array.
   *
   * @param path Path to the root of the postings list
   * @param fs File system
   * @param signatures Array of {@link Signature} to be initialized.
   */
  public static void loadSignatures(String path, FileSystem fs, Signature[] signatures) throws IOException {
    FSDataInputStream input = fs.open(new Path(path + "/" + BloomConfig.CONFIG_FILE));
    BloomConfig bloomConfig = BloomConfig.readInstance(input);
    input.close();

    FileStatus[] stat = fs.listStatus(new Path(path));
    for(int f = 0; f < stat.length; f++) {
      String name = stat[f].getPath().toString();
      name = name.substring(name.lastIndexOf('/') + 1);

      if(name.equals(BloomConfig.CONFIG_FILE)) {
        continue;
      }

      LOGGER.info("reading block: " + name);
      input = fs.open(stat[f].getPath());

      while(true) {
        try {
          int id = input.readInt();
          int df = input.readInt();
          if(df <= bloomConfig.getIdentityHashThreshold()) {
            signatures[id] = BloomFilterHash.readInstance(input);
          } else {
            signatures[id] = BloomFilterIdentityHash.readInstance(input);
          }
        } catch(EOFException ex) {
          break;
        }
      }
      input.close();
    }
  }

  /**
   * Creates Signatures and writes them to disk.
   *
   * @param outputPath Root path to store the output in.
   * @param fs File system
   * @param env Retireval Environment
   * @param spamScoresPath Path to spam/quality scores
   * @param bitsPerElement Number of bits dedicated to one element
   * @param nbHash Number of hash functions
   */
  public static void writeSignatures(String outputPath, FileSystem fs, RetrievalEnvironment env,
                                     String spamScoresPath, int bitsPerElement, int nbHash) throws IOException {
    SpamPercentileScore spamScores = new SpamPercentileScore();
    spamScores.initialize(spamScoresPath, fs);
    int[] newDocids = DocumentUtility.spamSortDocids(spamScores);

    int collectionSize = env.readCollectionTermCount();
    Posting posting = new Posting();
    FSDataOutputStream out;

    BloomConfig bloomConfig =  new BloomConfig((int) env.getDocumentCount(),
                                               collectionSize, nbHash, bitsPerElement);
    //Deletes the output path if it already exists.
    fs.delete(new Path(outputPath));

    //Serialize and write the configuration parameters.
    out = fs.create(new Path(outputPath + "/" + BloomConfig.CONFIG_FILE));
    bloomConfig.write(out);
    out.close();

    for(int i = 0; i <= collectionSize; i++) {
      if(i % 100000 == 0) {
        if(i != 0) {
          out.close();
        }
        out = fs.create(new Path(outputPath + "/" + i));
      }

      try {
        PostingsList pl = env.getPostingsList(env.getTermFromId(i));
        PostingsReader reader = pl.getPostingsReader();
        Signature filter = null;

        //Decide which filter to use based on the configuration parameters
        int df = pl.getDf();
        if (df <= bloomConfig.getIdentityHashThreshold()) {
          filter = new BloomFilterHash(df * bloomConfig.getBitsPerElement(),
                                       bloomConfig.getHashCount());
        } else {
          filter = new BloomFilterIdentityHash(bloomConfig.getDocumentCount());
        }

        while (reader.nextPosting(posting)) {
          filter.add(newDocids[posting.getDocno()]);
        }

        out.writeInt(i);
        out.writeInt(df);
        filter.write(out);
      } catch(Exception e) {
        continue;
      }
    }

    if(out != null) {
      out.close();
    }
  }
}
TOP

Related Classes of ivory.bloomir.data.SignatureIO

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.