Package ivory.core.data.stat

Source Code of ivory.core.data.stat.PrefixEncodedGlobalStats

/*
* Ivory: A Hadoop toolkit for Web-scale information retrieval
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

package ivory.core.data.stat;

import ivory.core.RetrievalEnvironment;
import ivory.core.data.dictionary.PrefixEncodedLexicographicallySortedDictionary;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.WritableUtils;
import org.apache.log4j.Logger;


import edu.umd.cloud9.io.pair.PairOfIntLong;

public class PrefixEncodedGlobalStats {
  private static final Logger LOG = Logger.getLogger(PrefixEncodedGlobalStats.class);

   PrefixEncodedLexicographicallySortedDictionary prefixSet =
       new PrefixEncodedLexicographicallySortedDictionary();

  Configuration conf = new Configuration();
  FileSystem fileSys = FileSystem.get(conf);
  int[] df = null;
  long[] cf = null;

  FSDataInputStream termsInput = null;
  FSDataInputStream dfStatsInput = null;
  FSDataInputStream cfStatsInput = null;

  public PrefixEncodedGlobalStats(Path prefixSetPath) throws IOException {
    termsInput = fileSys.open(prefixSetPath);

    prefixSet.readFields(termsInput);
    termsInput.close();
  }

  public PrefixEncodedGlobalStats(Path prefixSetPath, FileSystem fs) throws IOException {
    fileSys = fs;
    termsInput = fileSys.open(prefixSetPath);

    prefixSet.readFields(termsInput);
    termsInput.close();
  }

  public void loadDFStats(Path dfStatsPath) throws IOException {
    loadDFStats(dfStatsPath, fileSys);
  }
 
  public void loadDFStats(Path dfStatsPath, FileSystem fs) throws IOException {
    dfStatsInput = fs.open(dfStatsPath);

    int l = dfStatsInput.readInt();
    if (l != prefixSet.size()) {
      throw new RuntimeException("df length mismatch: " + l + "\t" + prefixSet.size());
    }
    df = new int[l];
    for (int i = 0; i < l; i++)
      //df[i] = dfStatsInput.readInt();
      df[i] = WritableUtils.readVInt(dfStatsInput);
   
    dfStatsInput.close();
  }

  public void loadCFStats(Path cfStatsPath) throws IOException {
    loadCFStats(cfStatsPath, fileSys);
  }
 
  public void loadCFStats(Path cfStatsPath, FileSystem fs) throws IOException {
    cfStatsInput = fs.open(cfStatsPath);

    int l = cfStatsInput.readInt();
    if (l != prefixSet.size()) {
      throw new RuntimeException("cf length mismatch: " + l + "\t" + prefixSet.size());
    }
    cf = new long[l];
    for (int i = 0; i < l; i++)
      //cf[i] = cfStatsInput.readLong();
      cf[i] = WritableUtils.readVLong(cfStatsInput);
    cfStatsInput.close();
  }

  public int getDF(String term) {
    if(df == null)
      throw new RuntimeException("DF-Stats must be loaded first!");
    int index = prefixSet.getId(term);
    LOG.info("index of " + term + ": " + index);
    if (index < 0)
      return -1;
    return df[index];
  }

  public long getCF(String term) {
    if(cf == null)
      throw new RuntimeException("CF-Stats must be loaded first!");
    int index = prefixSet.getId(term);
    LOG.info("index of " + term + ": " + index);
    if (index < 0)
      return -1;
    return cf[index];
  }

  public PairOfIntLong getStats(String term) {
    int index = prefixSet.getId(term);
    LOG.info("index of " + term + ": " + index);
    if (index < 0)
      return null;
    PairOfIntLong p = new PairOfIntLong();
    p.set(df[index], cf[index]);
    return p;
  }
 
  public PairOfIntLong getStats(int index) {
    if (index < 0)
      return null;
    PairOfIntLong p = new PairOfIntLong();
    p.set(df[index], cf[index]);
    return p;
  }

  public int length() {
    return prefixSet.size();
  }

  public void printKeys() {
    System.out.println("Window: " + this.prefixSet.getWindowSize());
    System.out.println("Length: " + this.length());
    // int window = prefixSet.getWindow();
    for (int i = 0; i < length() && i < 100; i++) {
      System.out.print(i + "\t" + prefixSet.getTerm(i));
      if (df != null)
        System.out.print("\t" + df[i]);
      if (cf != null)
        System.out.print("\t" + cf[i]);
      System.out.println();
    }
  }

  /*
   * public void printPrefixSetContent(){ prefixSet.printCompressedKeys();
   * prefixSet.printKeys(); }
   */
  public static void main(String[] args) throws Exception{
    //String indexPath = "/umd-lin/telsayed/indexes/medline04";
    String indexPath = "c:/Research/ivory-workspace";

    Configuration conf = new Configuration();
    FileSystem fileSys= FileSystem.getLocal(conf);

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fileSys);
   
    Path termsFilePath = new Path(env.getIndexTermsData());
   
    Path dfByTermFilePath = new Path(env.getDfByTermData());
    Path cfByTermFilePath = new Path(env.getCfByTermData());

    System.out.println("PrefixEncodedGlobalStats");
    PrefixEncodedGlobalStats globalStatsMap = new PrefixEncodedGlobalStats(termsFilePath);
    System.out.println("PrefixEncodedGlobalStats1");
    globalStatsMap.loadDFStats(dfByTermFilePath);
    System.out.println("PrefixEncodedGlobalStats2");
    globalStatsMap.loadCFStats(cfByTermFilePath);
    System.out.println("PrefixEncodedGlobalStats3");
    //String[] firstKeys = termIDMap.getDictionary().getFirstKeys(100);
    int nTerms = globalStatsMap.length();
    System.out.println("nTerms: "+nTerms);
    /*for(int i = 0; i < nTerms; i++){
     
      PairOfIntLong p = globalStatsMap.getStats(i);
      System.out.println(i+"\t"+p.getLeftElement() +"\t"+ p.getRightElement());
      //if(i%10000 == 0) System.out.println(i+" terms so far ("+p+").");
    }*/
    String term;
    term = "0046";  System.out.println(term+"\t"+globalStatsMap.getDF(term));
    term = "00565";  System.out.println(term+"\t"+globalStatsMap.getDF(term));
    term = "01338";  System.out.println(term+"\t"+globalStatsMap.getDF(term));
    term = "01hz";  System.out.println(term+"\t"+globalStatsMap.getDF(term));
    term = "03x";  System.out.println(term+"\t"+globalStatsMap.getDF(term));
    term = "0278x";  System.out.println(term+"\t"+globalStatsMap.getDF(term));
   
    term = "0081";  System.out.println(term+"\t"+globalStatsMap.getDF(term));
    term = "0183";  System.out.println(term+"\t"+globalStatsMap.getDF(term));
    term = "0244";  System.out.println(term+"\t"+globalStatsMap.getDF(term));
    term = "032";  System.out.println(term+"\t"+globalStatsMap.getDF(term));
    //for(int i = 1; i<=200; i++){
    //  term = termIDMap.getTerm(i);
    //  System.out.println(i+"\t"+term+"\t"+termIDMap.getID(term));
    //}
  }
}
TOP

Related Classes of ivory.core.data.stat.PrefixEncodedGlobalStats

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.