/*
* Ivory: A Hadoop toolkit for Web-scale information retrieval
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package ivory.core.data.stat;
import ivory.core.RetrievalEnvironment;
import ivory.core.data.dictionary.PrefixEncodedLexicographicallySortedDictionary;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.WritableUtils;
import org.apache.log4j.Logger;
import edu.umd.cloud9.io.pair.PairOfIntLong;
/**
 * Per-term global statistics (document frequency {@code df} and collection
 * frequency {@code cf}) indexed by the term ids of a
 * {@link PrefixEncodedLexicographicallySortedDictionary}.
 *
 * <p>The dictionary is read when the object is constructed. The df and cf
 * tables are loaded separately through {@link #loadDFStats(Path)} and
 * {@link #loadCFStats(Path)}; lookups fail with a {@link RuntimeException}
 * until the corresponding table has been loaded.
 */
public class PrefixEncodedGlobalStats {
  private static final Logger LOG = Logger.getLogger(PrefixEncodedGlobalStats.class);

  PrefixEncodedLexicographicallySortedDictionary prefixSet =
      new PrefixEncodedLexicographicallySortedDictionary();
  Configuration conf = new Configuration();
  // Assigned by the constructors; the default file system is only looked up
  // when the caller does not supply one.
  FileSystem fileSys;
  int[] df = null;
  long[] cf = null;
  FSDataInputStream termsInput = null;
  FSDataInputStream dfStatsInput = null;
  FSDataInputStream cfStatsInput = null;

  /**
   * Reads the dictionary from {@code prefixSetPath} on the file system
   * returned by {@code FileSystem.get} for a fresh {@link Configuration}.
   *
   * @param prefixSetPath location of the serialized dictionary
   * @throws IOException if the file system cannot be obtained or the
   *         dictionary cannot be read
   */
  public PrefixEncodedGlobalStats(Path prefixSetPath) throws IOException {
    fileSys = FileSystem.get(conf);
    loadDictionary(prefixSetPath);
  }

  /**
   * Reads the dictionary from {@code prefixSetPath} on the given file system.
   * The same file system is used by the single-argument load methods.
   *
   * @param prefixSetPath location of the serialized dictionary
   * @param fs file system to read from
   * @throws IOException if the dictionary cannot be read
   */
  public PrefixEncodedGlobalStats(Path prefixSetPath, FileSystem fs) throws IOException {
    fileSys = fs;
    loadDictionary(prefixSetPath);
  }

  /** Opens the dictionary file, deserializes it, and always closes the stream. */
  private void loadDictionary(Path prefixSetPath) throws IOException {
    termsInput = fileSys.open(prefixSetPath);
    try {
      prefixSet.readFields(termsInput);
    } finally {
      termsInput.close();
    }
  }

  /**
   * Loads the df table from this object's file system.
   *
   * @param dfStatsPath location of the df file
   * @throws IOException if the file cannot be read
   * @throws RuntimeException if the stored entry count differs from the
   *         dictionary size
   */
  public void loadDFStats(Path dfStatsPath) throws IOException {
    loadDFStats(dfStatsPath, fileSys);
  }

  /**
   * Loads the df table from the given file system. The file holds an
   * {@code int} entry count followed by that many VInt-encoded values.
   * The table is replaced only if the whole file is read successfully.
   *
   * @param dfStatsPath location of the df file
   * @param fs file system to read from
   * @throws IOException if the file cannot be read
   * @throws RuntimeException if the stored entry count differs from the
   *         dictionary size
   */
  public void loadDFStats(Path dfStatsPath, FileSystem fs) throws IOException {
    dfStatsInput = fs.open(dfStatsPath);
    try {
      int count = dfStatsInput.readInt();
      if (count != prefixSet.size()) {
        throw new RuntimeException("df length mismatch: " + count + "\t" + prefixSet.size());
      }
      int[] loaded = new int[count];
      for (int i = 0; i < count; i++) {
        loaded[i] = WritableUtils.readVInt(dfStatsInput);
      }
      df = loaded;
    } finally {
      // Close on every path, including the mismatch and read-failure cases.
      dfStatsInput.close();
    }
  }

  /**
   * Loads the cf table from this object's file system.
   *
   * @param cfStatsPath location of the cf file
   * @throws IOException if the file cannot be read
   * @throws RuntimeException if the stored entry count differs from the
   *         dictionary size
   */
  public void loadCFStats(Path cfStatsPath) throws IOException {
    loadCFStats(cfStatsPath, fileSys);
  }

  /**
   * Loads the cf table from the given file system. The file holds an
   * {@code int} entry count followed by that many VLong-encoded values.
   * The table is replaced only if the whole file is read successfully.
   *
   * @param cfStatsPath location of the cf file
   * @param fs file system to read from
   * @throws IOException if the file cannot be read
   * @throws RuntimeException if the stored entry count differs from the
   *         dictionary size
   */
  public void loadCFStats(Path cfStatsPath, FileSystem fs) throws IOException {
    cfStatsInput = fs.open(cfStatsPath);
    try {
      int count = cfStatsInput.readInt();
      if (count != prefixSet.size()) {
        throw new RuntimeException("cf length mismatch: " + count + "\t" + prefixSet.size());
      }
      long[] loaded = new long[count];
      for (int i = 0; i < count; i++) {
        loaded[i] = WritableUtils.readVLong(cfStatsInput);
      }
      cf = loaded;
    } finally {
      // Close on every path, including the mismatch and read-failure cases.
      cfStatsInput.close();
    }
  }

  /**
   * Resolves a term to its dictionary id. The lookup is traced at DEBUG
   * level only, so per-term calls do not flood the log or build the message
   * string when tracing is off.
   */
  private int lookup(String term) {
    int index = prefixSet.getId(term);
    if (LOG.isDebugEnabled()) {
      LOG.debug("index of " + term + ": " + index);
    }
    return index;
  }

  /**
   * Returns the df value of a term.
   *
   * @param term term to look up
   * @return the df value, or {@code -1} if the dictionary returns a negative
   *         id for the term
   * @throws RuntimeException if the df table has not been loaded
   */
  public int getDF(String term) {
    if (df == null) {
      throw new RuntimeException("DF-Stats must be loaded first!");
    }
    int index = lookup(term);
    return index < 0 ? -1 : df[index];
  }

  /**
   * Returns the cf value of a term.
   *
   * @param term term to look up
   * @return the cf value, or {@code -1} if the dictionary returns a negative
   *         id for the term
   * @throws RuntimeException if the cf table has not been loaded
   */
  public long getCF(String term) {
    if (cf == null) {
      throw new RuntimeException("CF-Stats must be loaded first!");
    }
    int index = lookup(term);
    return index < 0 ? -1 : cf[index];
  }

  /**
   * Returns the (df, cf) pair of a term.
   *
   * @param term term to look up
   * @return a new pair, or {@code null} if the dictionary returns a negative
   *         id for the term
   * @throws RuntimeException if the term is known but the df or cf table has
   *         not been loaded
   */
  public PairOfIntLong getStats(String term) {
    return getStats(lookup(term));
  }

  /**
   * Returns the (df, cf) pair stored at a term id.
   *
   * @param index term id
   * @return a new pair, or {@code null} if {@code index} is negative
   * @throws RuntimeException if {@code index} is non-negative but the df or
   *         cf table has not been loaded
   */
  public PairOfIntLong getStats(int index) {
    if (index < 0) {
      return null;
    }
    // Report a missing table the same way getDF/getCF do instead of
    // failing with a bare NullPointerException.
    if (df == null) {
      throw new RuntimeException("DF-Stats must be loaded first!");
    }
    if (cf == null) {
      throw new RuntimeException("CF-Stats must be loaded first!");
    }
    PairOfIntLong p = new PairOfIntLong();
    p.set(df[index], cf[index]);
    return p;
  }

  /**
   * @return the number of entries in the dictionary
   */
  public int length() {
    return prefixSet.size();
  }

  /**
   * Prints the dictionary window size, the number of entries, and up to the
   * first 100 terms to standard output, one tab-separated line per term.
   * The df and cf columns are included only for tables that are loaded.
   */
  public void printKeys() {
    System.out.println("Window: " + this.prefixSet.getWindowSize());
    System.out.println("Length: " + this.length());
    int limit = Math.min(length(), 100);
    for (int i = 0; i < limit; i++) {
      StringBuilder line = new StringBuilder();
      line.append(i).append("\t").append(prefixSet.getTerm(i));
      if (df != null) {
        line.append("\t").append(df[i]);
      }
      if (cf != null) {
        line.append("\t").append(cf[i]);
      }
      System.out.println(line);
    }
  }

  /**
   * Manual smoke test: loads the dictionary and both tables of an index on
   * the local file system and prints the df value of a few sample terms.
   *
   * @param args optional; {@code args[0]} is the index path. When absent
   *        the historical default {@code c:/Research/ivory-workspace} is used.
   * @throws Exception if the index cannot be read
   */
  public static void main(String[] args) throws Exception {
    String indexPath = args.length > 0 ? args[0] : "c:/Research/ivory-workspace";

    Configuration conf = new Configuration();
    FileSystem fileSys = FileSystem.getLocal(conf);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fileSys);

    Path termsFilePath = new Path(env.getIndexTermsData());
    Path dfByTermFilePath = new Path(env.getDfByTermData());
    Path cfByTermFilePath = new Path(env.getCfByTermData());

    System.out.println("PrefixEncodedGlobalStats");
    // Read from the same local file system the environment was built on;
    // the default file system may point somewhere else.
    PrefixEncodedGlobalStats globalStatsMap =
        new PrefixEncodedGlobalStats(termsFilePath, fileSys);
    System.out.println("PrefixEncodedGlobalStats1");
    globalStatsMap.loadDFStats(dfByTermFilePath);
    System.out.println("PrefixEncodedGlobalStats2");
    globalStatsMap.loadCFStats(cfByTermFilePath);
    System.out.println("PrefixEncodedGlobalStats3");

    int nTerms = globalStatsMap.length();
    System.out.println("nTerms: " + nTerms);

    String[] sampleTerms = {
      "0046", "00565", "01338", "01hz", "03x", "0278x", "0081", "0183", "0244", "032"
    };
    for (int i = 0; i < sampleTerms.length; i++) {
      String term = sampleTerms[i];
      System.out.println(term + "\t" + globalStatsMap.getDF(term));
    }
  }
}