// Parse queries and find integer codes for the query terms.
HMapIV<String> parsedQueries = QueryUtility.loadQueries(queryPath);
HMapIV<int[]> queries = QueryUtility.queryToIntegerCode(env, parsedQueries);
// Tracks term ids already written so a term shared by several queries is emitted only once.
Set<Integer> termidHistory = Sets.newHashSet();
// Accumulates (remapped docid -> document length) for every document seen in any postings list.
HMapII docLengths = new HMapII();
SpamPercentileScore spamScores = new SpamPercentileScore();
spamScores.initialize(spamPath, fs);
// newDocids[originalDocno] gives the remapped docid; presumably docids are reordered by
// spam percentile so that sorted docid order correlates with spam rank — TODO confirm.
int[] newDocids = DocumentUtility.spamSortDocids(spamScores);
Posting posting = new Posting();
// Reused per term: positions in remapped-docid order, and a docid -> positions lookup.
List<TermPositions> positions = Lists.newArrayList();
Map<Integer, TermPositions> positionsMap = Maps.newHashMap();
for(int qid: queries.keySet()) {
for(int termid: queries.get(qid)) {
if(!termidHistory.contains(termid)) {
termidHistory.add(termid);
PostingsList pl = env.getPostingsList(env.getTermFromId(termid));
// NOTE(review): reader is not explicitly closed/released; verify PostingsReader
// needs no cleanup.
PostingsReader reader = pl.getPostingsReader();
positions.clear();
positionsMap.clear();
// One slot per document in this term's postings list (df = document frequency).
int[] data = new int[pl.getDf()];
int index = 0;
while (reader.nextPosting(posting)) {
// Remap the original docno to its spam-sorted docid.
data[index] = newDocids[posting.getDocno()];
// NOTE(review): assumes each docno appears at most once per postings list;
// a duplicate remapped docid would silently overwrite its map entry.
positionsMap.put(data[index], new TermPositions(reader.getPositions(), reader.getTf()));
docLengths.put(data[index], env.getDocumentLength(posting.getDocno()));
index++;
}
// Sort docids ascending, then rebuild the positions list in the same order so that
// positions.get(i) corresponds to data[i] when the compressed postings are written.
Arrays.sort(data);
for(int i = 0; i < data.length; i++) {
positions.add(positionsMap.get(data[i]));
}
// Per-term record: termid, df, then the compressed positional postings.
output.writeInt(termid);
output.writeInt(pl.getDf());
CompressedPositionalPostings.newInstance(data, positions).write(output);
}
}
LOGGER.info("Compressed query " + qid);
}
// Sentinel marking the end of the per-term records.
output.writeInt(-1);
// Trailer: number of (docid, length) pairs, followed by the pairs themselves.
output.writeInt(docLengths.size());
for(int docid: docLengths.keySet()) {
output.writeInt(docid);
output.writeInt(docLengths.get(docid));
}
output.close();
}